### Beginning of the Assignment - exploration

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
#pd.set_option('max_columns', 200)

### PHIL DATA SET

In [None]:
character_nicknames_df = pd.read_csv('datasets/PHIL DATA SET/character_nicknames.csv')

In [None]:
details_df = pd.read_csv('datasets/PHIL DATA SET/details.csv')

In [None]:
favs_df = pd.read_csv('datasets/PHIL DATA SET/favs.csv')


In [None]:
person_details_df = pd.read_csv('datasets/PHIL DATA SET/person_details.csv')


In [None]:
person_alternate_names_df = pd.read_csv('datasets/PHIL DATA SET/person_alternate_names.csv')


In [None]:
person_anime_works_df = pd.read_csv('datasets/PHIL DATA SET/person_anime_works.csv')


In [None]:
stats_df = pd.read_csv('datasets/PHIL DATA SET/stats.csv')


### DENIS DATA SET

In [None]:
ratings_df = pd.read_csv('datasets/DENIS DATA SET/ratings.csv')

In [None]:
characters_df = pd.read_csv('datasets/DENIS DATA SET/characters.csv')

In [None]:
character_anime_works_df = pd.read_csv('datasets/DENIS DATA SET/character_anime_works.csv')

In [None]:
person_voice_works_df = pd.read_csv('datasets/DENIS DATA SET/person_voice_works.csv')

In [None]:
profiles_df = pd.read_csv('datasets/DENIS DATA SET/profiles.csv')

In [None]:
recommendations_df = pd.read_csv('datasets/DENIS DATA SET/recommendations.csv')

##### First look

In [None]:
character_nicknames_df.shape

In [None]:
character_nicknames_df.head()

In [None]:
character_nicknames_df.columns
#will list all the columns. Not necessary here but kept as a pattern to follow with the following files

In [None]:
character_nicknames_df.dtypes

In [None]:
character_nicknames_df.describe()
#not needed here because it should just be ids

### Data preparation (cleaning)


##### On the first dataset we may need to check for duplicates.
What does that mean? 102 of the 37080 rows in the dataset are duplicated.
Why is that? Do some characters have multiple nicknames, so that they appear more than once in the dataset?

In [None]:
# duplicated() uses keep='first' by default: the first occurrence is left
# unmarked, so only the repeated (second and later) rows are listed here
character_nicknames_df[character_nicknames_df.duplicated(keep='first')]

In [None]:
character_nicknames_df.query('character_mal_id == 75628')

Hmm, they are somewhat different, so yes, the same character can have different nicknames.
However, we do want to drop the rows that are exactly the same.

In [None]:
# Remove the rows that are exact copies of an earlier row (the first occurrence
# is kept) and renumber the index from 0.
# No `subset` is needed here because the frame has just 2 columns.
character_nicknames_df = (
    character_nicknames_df
    .drop_duplicates(keep='first')
    .reset_index(drop=True)
)

Let's check for nan values

In [None]:
# Show every row that has a missing value in at least one column
character_nicknames_df.loc[character_nicknames_df.isnull().any(axis='columns')]


In [None]:
# Drop every row that has a NaN in any column, then renumber the index so it
# stays contiguous (0..n-1), as it was after the de-duplication step;
# dropna() alone would leave gaps in the index where rows were removed.
character_nicknames_df = character_nicknames_df.dropna().reset_index(drop=True)


### SECOND DATASET

##### On the second dataset there are no duplicates, so we may need to check for missing and/or inconsistent values instead

In [None]:
details_df.isna().sum()

In [None]:
details_df
# Anime details table.
# - The Japanese title could probably be dropped.
# - members stands for how many users have added this anime to their list.
# - explicit_genres is empty, so it can be removed.
# - licensor and streaming are mostly empty: do we care about them?

In [None]:
details_df.describe()

In [None]:
details_df.query('rank == 1')

In [None]:
details_df[['start_date','season']].query("season.notna()")
#season can be removed? Do we care about the season? We can "calculate" it from the "start_date" field

In [None]:
details_df.query("episodes > 2500")

In [None]:
details_df.dtypes
# scoredby, rank, episodes and year could be ints instead of floats
# the start and end dates are stored as object but should be parsed as dates
# empty lists ("[]") should be replaced with NaN so that .isna() and the other pandas missing-value methods work on them


### THIRD DATASET

In [None]:
favs_df

In [None]:
favs_df.isna().sum()


In [None]:
favs_df.dtypes

In [None]:
favs_df.duplicated().sum()
#there are no duplicates


### FOURTH DATASET

In [None]:
person_alternate_names_df

In [None]:
person_alternate_names_df.dtypes

In [None]:
person_alternate_names_df.isna().sum()

In [None]:
person_alternate_names_df = person_alternate_names_df.dropna()


In [None]:
# Show every row whose (person_mal_id, alt_name) pair occurs more than once,
# sorted so that the repeated rows sit next to each other
(
    person_alternate_names_df
    .loc[lambda df: df.duplicated(subset=['person_mal_id', 'alt_name'], keep=False)]
    .sort_values(by=['person_mal_id', 'alt_name'])
)


In [None]:
# Keep one row per (person_mal_id, alt_name) pair. This is the same key the
# duplicate inspection uses, so the rows flagged there are exactly the ones
# removed here (the first occurrence of each pair is kept).
person_alternate_names_df = person_alternate_names_df.drop_duplicates(
    subset=['person_mal_id', 'alt_name'], keep='first'
)

### FIFTH DATASET

In [None]:
person_details_df

In [None]:
# Rows whose person_mal_id already appeared earlier in the frame.
# On inspection the repeated rows differ only in the "relevant_location" field,
# which is of no interest to us, so we keep the first row per person and drop
# the others.
person_details_duplicates = person_details_df.loc[person_details_df['person_mal_id'].duplicated()]
person_details_df = person_details_df.drop_duplicates(subset=['person_mal_id'], keep='first')
# Display the rows that were just removed (empty if this cell is run again)
person_details_duplicates

In [None]:
person_details_df[person_details_df.isna().any(axis=1)]


In [None]:
person_details_df.dtypes
# we need to change birthday from object to a date (datetime) type

In [None]:
person_details_df.isna().sum()
#we have to check the nan values

We can join the two tables person_details_df and person_alternate_names_df on their matching keys,
putting the alternate names of each person, as a list, in a new column called alt_name.

### SIXTH DATASET

In [None]:
person_anime_works_df

In [None]:
person_anime_works_df.dtypes
#the types are correct

In [None]:
person_anime_works_df.isna().sum()
#There's no nan value

### SEVENTH DATASET

In [None]:
stats_df

In [None]:
stats_df.dtypes
#we can swap *_votes to int instead of floats

In [None]:
stats_df.isna().sum()
#there are 430 series without any vote (?)
#we need to understand which ones are they
#TODO

In [None]:
stats_df[stats_df['score_1_votes'].isna()]


In [None]:
details_df.query('mal_id == 61558')

### EIGHTH DATASET 

In [None]:
ratings_df

In [None]:
# we have to understand the sense of "num_watched_episodes" and the link with "is_rewatching"
ratings_df.query('is_rewatching == 1')

In [None]:
ratings_df.dtypes

In [None]:
# "is_rewatching" is currently a float; store it as the nullable integer type
# "Int64" instead (a boolean would be another option)
rewatching_as_int = ratings_df["is_rewatching"].astype("Int64")
ratings_df["is_rewatching"] = rewatching_as_int

In [None]:
ratings_df.loc[ratings_df.duplicated(subset=['username', 'anime_id'], keep=False)]

In [None]:
ratings_df[ratings_df.duplicated(subset=['username','anime_id'], keep=False)].sort_values(['username','anime_id'])

In [None]:
# Each (username, anime_id) pair should appear only once. Usually we would keep
# the first occurrence of a duplicate, but here the last occurrence looks like
# the most up-to-date one (it contains more information), so we keep that one.
is_older_duplicate = ratings_df.duplicated(subset=['username', 'anime_id'], keep='last')
ratings_df = ratings_df[~is_older_duplicate]

In [None]:
ratings_df[ratings_df.duplicated(subset=['username','anime_id'], keep=False)].sort_values(['username','anime_id'])

In [None]:
# Count the NaN values in each column.
# TODO
# If necessary, check whether num_watched_episodes is greater than the number of episodes of the anime; the NaN values could then be replaced with one or zero.
ratings_df.isna().sum()

In [None]:
# drop "username" with Nan values?
#TODO
ratings_df[ratings_df['username'].isna()]

Check whether this username is also present in profiles_df.

In [None]:
# we want to see all duplicates to understand if we have to drop or not
#TODO
ratings_df.loc[ratings_df.duplicated(subset=['username', 'anime_id', 'status'], keep=False)]

### NINTH DATASET

In [None]:
characters_df

In [None]:
# check types of dataset columns
characters_df.dtypes

In [None]:
# Convert the float columns "character_mal_id" and "favorites" to the nullable
# integer type "Int64"
for int_column in ("character_mal_id", "favorites"):
    characters_df[int_column] = characters_df[int_column].astype("Int64")

In [None]:
characters_df.describe()

In [None]:
# we have only 2 rows where all columns are Nan, the rows with Nan values in "name_kanji" and "about" we shouldn't drop because they have other values that are important.
characters_df.isna().sum()

In [None]:
# here we want to check if the Nan values are concentrate in only two rows
characters_df[characters_df['character_mal_id'].isna()]

In [None]:
# Apart from "name_kanji" and "about", the NaN values are concentrated in two
# rows where every column is NaN: keep only the rows that have at least one
# non-missing value.
characters_df = characters_df[characters_df.notna().any(axis=1)]


In [None]:
# we want to see all duplicates to understand if we have to drop or not
characters_df.loc[characters_df.duplicated(subset=['character_mal_id', 'url', 'name'], keep=False)]

In [None]:
# Rows repeating the same (character_mal_id, url, name) carry the same values,
# so keep only the first occurrence of each
is_repeated_character = characters_df.duplicated(
    subset=['character_mal_id', 'url', 'name'], keep='first'
)
characters_df = characters_df.loc[~is_repeated_character]

### TENTH DATASET

In [None]:
# role of character anime
character_anime_works_df

In [None]:
# check types of columns
character_anime_works_df.dtypes

In [None]:
# check the number of Nan value
character_anime_works_df.isna().sum()

In [None]:
# list the rows whose (anime_mal_id, character_mal_id) pair already appeared in an earlier row; an empty result means there are no duplicates
character_anime_works_df.loc[character_anime_works_df.duplicated(subset=['anime_mal_id', 'character_mal_id'])]

There is no need to clean this dataset 

### ELEVENTH DATASET

In [None]:
person_voice_works_df

In [None]:
person_voice_works_df.dtypes

In [None]:
person_voice_works_df.isna().sum()

In [None]:
# check if the duplicates are in all columns
person_voice_works_df.loc[person_voice_works_df.duplicated(keep=False)]

In [None]:
# The duplicated rows are identical in every column, so keep the first
# occurrence of each and discard the repeats
is_repeated_row = person_voice_works_df.duplicated(keep='first')
person_voice_works_df = person_voice_works_df.loc[~is_repeated_row]

### TWELFTH DATASET

In [None]:
# Should we delete the last five columns
profiles_df

In [None]:
# check if the types are right for each field
profiles_df.dtypes

In [None]:
# Parse "birthday" and "joined" from object into datetimes; values that cannot
# be parsed become NaT (errors='coerce').
# The other columns that should be int are not converted here.
for date_column in ("birthday", "joined"):
    profiles_df[date_column] = pd.to_datetime(profiles_df[date_column], errors='coerce')

In [None]:
profiles_df.isna().sum()

In [None]:
profiles_df.query('username.isna()')

In [None]:
ratings_df.query('username.isna()')

In [None]:
# check if there is any duplicate on "username"
profiles_df.loc[profiles_df.duplicated(subset=['username'], keep='first')]

### THIRTEENTH DATASET

In [None]:
recommendations_df

In [None]:
recommendations_df.dtypes

In [None]:
recommendations_df.isna().sum()

In [None]:
# List the rows of recommendations_df that exactly repeat an earlier row.
# The boolean mask has to be computed on recommendations_df itself: a mask
# built from another frame (profiles_df) does not describe these rows and
# either fails to align on the index or selects unrelated rows.
recommendations_df.loc[recommendations_df.duplicated(keep='first')]