In [34]:
import numpy as np
import pandas as pd
import ast
import re
import json

from pymongo import MongoClient

In [35]:
animes = pd.read_csv("clean_data/animes.csv")
profiles = pd.read_csv("clean_data/profiles.csv")
reviews = pd.read_csv("clean_data/reviews.csv")

In [36]:
animes = animes.where(pd.notnull(animes), None)
profiles = profiles.where(pd.notnull(profiles), None)
reviews = reviews.where(pd.notnull(reviews), None)

In [37]:
len(animes["uid"].unique()), animes.shape

(13678, (13678, 11))

In [38]:
len(profiles["profile"].unique()), profiles.shape

(19254, (19254, 4))

In [39]:
len(reviews["uid"].unique()), reviews.shape

(5000, (5000, 6))

In [40]:
def string_list_to_list(string_list):
    
    string_list = string_list.strip("[]")
    
    
    if len(string_list) == 0:
        return []
    
    # Convert the string to a list
    string_list = [item.strip(" '") for item in string_list.split(",")]
    
    return string_list

In [41]:
animes.shape

(13678, 11)

In [42]:
animes.isna().sum()

uid            0
title          0
synopsis       0
genre          0
episodes       0
members        0
popularity     0
ranked         0
score          0
start_years    0
end_years      0
dtype: int64

In [43]:
animes.head()

Unnamed: 0,uid,title,synopsis,genre,episodes,members,popularity,ranked,score,start_years,end_years
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...",25.0,489888,141,25.0,8.82,2015.0,2016.0
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...",22.0,995473,28,24.0,8.83,2014.0,2015.0
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...",13.0,581663,98,23.0,8.83,2017.0,2017.0
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ...",64.0,1615084,4,1.0,9.23,2009.0,2010.0
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"['Action', 'Mystery', 'Supernatural', 'Vampire']",1.0,214621,502,22.0,8.83,2017.0,2017.0


In [44]:
animes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13678 entries, 0 to 13677
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   uid          13678 non-null  int64  
 1   title        13678 non-null  object 
 2   synopsis     13678 non-null  object 
 3   genre        13678 non-null  object 
 4   episodes     13678 non-null  float64
 5   members      13678 non-null  int64  
 6   popularity   13678 non-null  int64  
 7   ranked       13678 non-null  float64
 8   score        13678 non-null  float64
 9   start_years  13678 non-null  float64
 10  end_years    13678 non-null  float64
dtypes: float64(5), int64(3), object(3)
memory usage: 1.1+ MB


In [45]:
animes["genre"] = animes["genre"].apply(string_list_to_list)

In [46]:
animes["genre"][0]

['Comedy', 'Sports', 'Drama', 'School', 'Shounen']

In [47]:
profiles.shape

(19254, 4)

In [48]:
profiles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19254 entries, 0 to 19253
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   profile          19254 non-null  object 
 1   gender           19254 non-null  object 
 2   birthday         19254 non-null  float64
 3   favorites_anime  19254 non-null  object 
dtypes: float64(1), object(3)
memory usage: 601.8+ KB


In [49]:
profiles["favorites_anime"] = profiles["favorites_anime"].apply(lambda x: [int(i) for i in string_list_to_list(x)])

In [50]:
profiles["favorites_anime"][0]

[33352,
 25013,
 5530,
 33674,
 1482,
 269,
 18245,
 2904,
 27899,
 17074,
 12291,
 226,
 28851,
 8525,
 6594,
 4981,
 1698,
 457,
 34618]

In [51]:
profiles.head()

Unnamed: 0,profile,gender,birthday,favorites_anime
0,DesolatePsyche,Male,1994.0,"[33352, 25013, 5530, 33674, 1482, 269, 18245, ..."
1,baekbeans,Female,2000.0,"[11061, 31964, 853, 20583, 918, 9253, 34599, 3..."
2,aManOfCulture99,Male,1999.0,"[4181, 7791, 9617, 5680, 2167, 4382, 849, 3518..."
3,NIGGER_BONER,Male,1985.0,"[11061, 30, 6594, 28701, 10087, 6746, 918, 153..."
4,jchang,Male,1992.0,"[846, 2904, 5114, 2924, 72]"


In [52]:
reviews.shape

(5000, 6)

In [53]:
reviews.head()

Unnamed: 0,uid,profile,anime_uid,text,score,scores
0,255938,DesolatePsyche,34096,"First things first. My ""reviews"" system is exp...",8,"{'Overall': '8', 'Story': '8', 'Animation': '8..."
1,259117,baekbeans,34599,Let me start off by saying that Made in Abyss ...,10,"{'Overall': '10', 'Story': '10', 'Animation': ..."
2,291149,aManOfCulture99,4181,Kyoto Animations greatest strength is being ab...,10,"{'Overall': '10', 'Story': '10', 'Animation': ..."
3,140903,NIGGER_BONER,2904,Code Geass R2 is sitting pretty on myanimelist...,8,"{'Overall': '8', 'Story': '7', 'Animation': '9..."
4,23791,jchang,2904,code geass is one of those series that everybo...,10,"{'Overall': '10', 'Story': '10', 'Animation': ..."


In [54]:
reviews["scores"][0][1:-1]

"'Overall': '8', 'Story': '8', 'Animation': '8', 'Sound': '10', 'Character': '9', 'Enjoyment': '8'"

In [55]:
def convert_to_json(string):
    string = string.replace("'", '"')
    json_obj = json.loads(string)
    
    # Convert the values to integers
    for key in json_obj:
        json_obj[key] = int(json_obj[key])
    
    return json_obj

In [56]:
convert_to_json(reviews["scores"][0])

{'Overall': 8,
 'Story': 8,
 'Animation': 8,
 'Sound': 10,
 'Character': 9,
 'Enjoyment': 8}

In [57]:
reviews["scores"] = reviews["scores"].apply(convert_to_json)

In [58]:
reviews["scores"][0]

{'Overall': 8,
 'Story': 8,
 'Animation': 8,
 'Sound': 10,
 'Character': 9,
 'Enjoyment': 8}

In [59]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   uid        5000 non-null   int64 
 1   profile    5000 non-null   object
 2   anime_uid  5000 non-null   int64 
 3   text       5000 non-null   object
 4   score      5000 non-null   int64 
 5   scores     5000 non-null   object
dtypes: int64(3), object(3)
memory usage: 234.5+ KB


In [60]:
print("Columns in animes table:")
print(animes.columns)

print("\nColumns in profiles table:")
print(profiles.columns)

print("\nColumns in reviews table:")
print(reviews.columns)

Columns in animes table:
Index(['uid', 'title', 'synopsis', 'genre', 'episodes', 'members',
       'popularity', 'ranked', 'score', 'start_years', 'end_years'],
      dtype='object')

Columns in profiles table:
Index(['profile', 'gender', 'birthday', 'favorites_anime'], dtype='object')

Columns in reviews table:
Index(['uid', 'profile', 'anime_uid', 'text', 'score', 'scores'], dtype='object')


In [61]:
reviews["scores"][0]

{'Overall': 8,
 'Story': 8,
 'Animation': 8,
 'Sound': 10,
 'Character': 9,
 'Enjoyment': 8}

In [62]:
reviews.to_dict('records')[0]["scores"]

{'Overall': 8,
 'Story': 8,
 'Animation': 8,
 'Sound': 10,
 'Character': 9,
 'Enjoyment': 8}

In [63]:
profiles.to_dict('records')[0]

{'profile': 'DesolatePsyche',
 'gender': 'Male',
 'birthday': 1994.0,
 'favorites_anime': [33352,
  25013,
  5530,
  33674,
  1482,
  269,
  18245,
  2904,
  27899,
  17074,
  12291,
  226,
  28851,
  8525,
  6594,
  4981,
  1698,
  457,
  34618]}

In [64]:
type(profiles.to_dict('records')[0]["favorites_anime"])

list

In [65]:
# Replace NaN with None
# animes = animes.where(pd.notnull(animes), None)
# profiles = profiles.where(pd.notnull(profiles), None)
# reviews = reviews.where(pd.notnull(reviews), None)

# Convert scores column to JSON strings
#reviews['scores'] = reviews['scores'].apply(json.dumps)

# Connect to MongoDB
client = MongoClient("mongodb://localhost:27018/")
db = client["anime_database"]


db.animes.drop()
db.profiles.drop()
db.reviews.drop()

animesCollection = db["animes"]
animesCollection.insert_many(animes.to_dict('records'))
profilesCollection = db["profiles"]
profilesCollection.insert_many(profiles.to_dict('records'))
reviewsCollection = db["reviews"]
reviewsCollection.insert_many(reviews.to_dict('records'))

InsertManyResult([ObjectId('666cc0d11a4649d791756dc1'), ObjectId('666cc0d11a4649d791756dc2'), ObjectId('666cc0d11a4649d791756dc3'), ObjectId('666cc0d11a4649d791756dc4'), ObjectId('666cc0d11a4649d791756dc5'), ObjectId('666cc0d11a4649d791756dc6'), ObjectId('666cc0d11a4649d791756dc7'), ObjectId('666cc0d11a4649d791756dc8'), ObjectId('666cc0d11a4649d791756dc9'), ObjectId('666cc0d11a4649d791756dca'), ObjectId('666cc0d11a4649d791756dcb'), ObjectId('666cc0d11a4649d791756dcc'), ObjectId('666cc0d11a4649d791756dcd'), ObjectId('666cc0d11a4649d791756dce'), ObjectId('666cc0d11a4649d791756dcf'), ObjectId('666cc0d11a4649d791756dd0'), ObjectId('666cc0d11a4649d791756dd1'), ObjectId('666cc0d11a4649d791756dd2'), ObjectId('666cc0d11a4649d791756dd3'), ObjectId('666cc0d11a4649d791756dd4'), ObjectId('666cc0d11a4649d791756dd5'), ObjectId('666cc0d11a4649d791756dd6'), ObjectId('666cc0d11a4649d791756dd7'), ObjectId('666cc0d11a4649d791756dd8'), ObjectId('666cc0d11a4649d791756dd9'), ObjectId('666cc0d11a4649d791756d

In [66]:
reviews.to_dict('records')[0]["scores"]

{'Overall': 8,
 'Story': 8,
 'Animation': 8,
 'Sound': 10,
 'Character': 9,
 'Enjoyment': 8}