# Import libraries to run the project

In [None]:
import sys
!pip install -q rdflib networkx matplotlib
!{sys.executable} -m pip install rdflib networkx matplotlib --user
import pandas as pd #for handling csv and csv contents
from rdflib import Graph, Literal, RDF,RDFS, URIRef, Namespace #basic RDF handling
from rdflib.namespace import FOAF , XSD #most common namespaces
import urllib.parse #for parsing strings to URI's

# Load anime.csv data

In [None]:
# Location of the anime catalogue CSV in the project's GitHub repository.
url = 'https://raw.githubusercontent.com/csepulvedaa/Proyecto-CC7220-G12/main/anime.csv'

# Read the comma-separated file (fields quoted with double quotes) into a DataFrame.
df = pd.read_csv(
    url,
    quotechar='"',
    sep=",",
)
# df # uncomment to check for contents

# Load rating.csv data

In [None]:
# Location of the per-user ratings CSV in the project's GitHub repository.
url2 = 'https://raw.githubusercontent.com/csepulvedaa/Proyecto-CC7220-G12/main/rating.csv'
df2 = pd.read_csv(url2, sep=",", quotechar='"')

# Keep only rows whose rating is greater than -1.
# NOTE(review): presumably -1 marks "no rating given" -- confirm against the dataset description.
df2_filtered = df2[df2.rating > -1]

# Mean user rating per anime. The aggregation must run on the filtered user
# ratings (df2_filtered), not on the anime catalogue `df`; grouping `df` would
# just reproduce the catalogue's own 'rating' column.
# Result columns: 'anime_id', 'rating' (as_index=False keeps anime_id as a column).
df2_mean = df2_filtered.groupby(['anime_id'], as_index=False)['rating'].mean()
# df2  # uncomment to check for contents

# Main code to transform data
### Auxiliary function

In [None]:
# Aux function to get values from df2_mean
def get_user_mean_rate(id):
    """Return the mean user rating stored in ``df2_mean`` for an anime id.

    ``df2_mean`` is expected to have the columns ``anime_id`` and ``rating``.
    The row is selected by matching the ``anime_id`` column, not by row
    position: anime ids are not consecutive row numbers, so a positional
    lookup would return the mean of an unrelated anime (or fall out of range).

    Args:
        id: Anime identifier; anything accepted by ``int()``.

    Returns:
        The mean rating for that anime, or -1 when ``id`` cannot be converted
        to an integer, the anime has no entry in ``df2_mean``, or the stored
        mean is missing (NaN).
    """
    try:
        anime_id = int(id)
    except (TypeError, ValueError, OverflowError):
        # None, non-numeric strings, NaN and infinities are not valid ids.
        return -1

    matches = df2_mean.loc[df2_mean['anime_id'] == anime_id, 'rating']
    if matches.empty:
        return -1

    mean_rate = matches.iloc[0]
    if pd.isna(mean_rate):
        return -1
    return mean_rate

### Define triples for the RDF graph

In [None]:
# Build the RDF graph: one resource per row of `df`, with its literal
# attributes, its type and genres as resources, and the mean user rating
# returned by get_user_mean_rate. The graph is written to 'output.txt' in Turtle.
g = Graph()
schema_path = 'http://schema.org/'  # Paths to uri
anime_path = 'http://myanimelist.net/anime/'
schema = Namespace(schema_path)  # Prefix declaration
anime = Namespace(anime_path)

genre_list = []  # Save distinct genres for column 'genre'
type_list = []  # Save distinct types for column 'type'

# Predicates and classes used for every row, built once instead of per triple.
anime_class = URIRef(schema + 'Anime')
# NOTE(review): 'name' lives in the anime namespace while every other
# predicate uses the schema namespace; kept as is so existing queries keep
# working -- confirm whether schema + 'name' was intended.
name_pred = URIRef(anime + 'name')
episodes_pred = URIRef(schema + 'episodes')
rating_pred = URIRef(schema + 'rating')
members_pred = URIRef(schema + 'members')
type_pred = URIRef(schema + 'type')
user_mean_rate_pred = URIRef(schema + 'user_mean_rate')
genre_pred = URIRef(schema + 'genre')

for _, row in df.iterrows():
    subject = URIRef(anime + str(row['anime_id']))

    g.add((subject, RDF.type, anime_class))
    g.add((subject, name_pred, Literal(row['name'], datatype=XSD.string)))

    if not pd.isna(row['episodes']):
        g.add((subject, episodes_pred, Literal(row['episodes'], datatype=XSD.integer)))
    if not pd.isna(row['rating']):
        # Typed as xsd:double, matching user_mean_rate below: a rating with a
        # fractional part is not a valid xsd:integer lexical form.
        g.add((subject, rating_pred, Literal(row['rating'], datatype=XSD.double)))
    if not pd.isna(row['members']):
        g.add((subject, members_pred, Literal(row['members'], datatype=XSD.integer)))

    # The type is registered whenever the 'type' cell itself is present.
    # Guarding on 'genre' here would skip types of genre-less rows and could
    # let a missing (NaN) type into type_list.
    if not pd.isna(row['type']):
        g.add((subject, type_pred, URIRef(schema + str(row['type']))))
        if row['type'] not in type_list:
            type_list.append(row['type'])

    # Join rating dataset
    user_mean_rate = get_user_mean_rate(row['anime_id'])
    if user_mean_rate > 0:  # -1 signals "no mean available"
        g.add((subject, user_mean_rate_pred, Literal(user_mean_rate, datatype=XSD.double)))

    if not pd.isna(row['genre']):
        # The 'genre' cell holds a comma-separated list of genres.
        for a_genre in row['genre'].split(','):
            # Trim the blanks left by the split and replace inner spaces so
            # the genre can be used as a URI local name.
            genre = a_genre.strip().replace(' ', '_')
            g.add((subject, genre_pred, URIRef(schema + str(genre))))
            if genre not in genre_list:
                genre_list.append(genre)

genre_class = URIRef(schema + 'Genre')
for a_genre in genre_list:  # Add Genre
    g.add((URIRef(schema + str(a_genre)), RDF.type, genre_class))

type_class = URIRef(schema + 'Type')
for a_type in type_list:  # Add Type
    g.add((URIRef(schema + str(a_type)), RDF.type, type_class))

g.serialize(destination='output.txt', format='turtle')


# Result
The output is a file (`output.txt`) that contains the data serialized as RDF in Turtle format. It is then ready to be loaded into a SPARQL endpoint and queried.