## Concepts

From one user find best correlated user

Recommend 5 best rated anime from the user

Use the best rated anime from user and recommend 5 more 

Total 10 recommendation is provided 

## Import Library

In [2]:
import pandas as pd
import re
import time
import string

## Variables

In [3]:
target_user_id = 1
target_vote_number = 1000

## Loading Dataset
▪ Loading anime.csv and rating.csv into DataFrame

In [4]:
anime_data = pd.read_csv('anime.csv')
#animeDf = animeDf[:5000]
anime_data = pd.DataFrame(anime_data)
anime_data

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [5]:
rating_data = pd.read_csv('rating.csv')
rating_data = rating_data[:1000000]
rating_data = pd.DataFrame(rating_data)
rating_data

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
...,...,...,...
999995,9358,28999,-1
999996,9358,29067,-1
999997,9358,29093,-1
999998,9358,29095,-1


# Data Preprocessing

## Check Missing Value

In [6]:
rating_data.isna().sum()

user_id     0
anime_id    0
rating      0
dtype: int64

In [7]:
anime_data.isna().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

## Drop Missing Value

In [8]:
anime_data.dropna(axis=0, inplace= True)
anime_data.isna().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

## Check Duplicates

In [9]:
duplicated_anime = anime_data[anime_data.duplicated()].shape[0]
print(f'count of duplicated anime: {duplicated_anime}')

count of duplicated anime: 0


In [10]:
duplicated_rating = rating_data[rating_data.duplicated()].shape[0]
print(f'count of duplicated anime: {duplicated_rating}')

count of duplicated anime: 0


In [11]:
rating_data.drop_duplicates(keep='first',inplace=True)

duplicated_rating = rating_data[rating_data.duplicated()].shape[0]
print(f'count of duplicated anime after removing: {duplicated_rating}')

count of duplicated anime after removing: 0


## Merge Dataset by anime_id

In [12]:
raw_merged_data = pd.merge(rating_data, anime_data, on='anime_id', sort = True)
raw_merged_data

Unnamed: 0,user_id,anime_id,rating_x,name,genre,type,episodes,rating_y,members
0,13,1,-1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,8.82,486824
1,19,1,10,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,8.82,486824
2,21,1,9,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,8.82,486824
3,23,1,9,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,8.82,486824
4,32,1,10,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,8.82,486824
...,...,...,...,...,...,...,...,...,...
999981,6361,34283,7,Nobunaga no Shinobi Episode 0,"Comedy, Historical",Special,1,6.74,937
999982,7114,34283,6,Nobunaga no Shinobi Episode 0,"Comedy, Historical",Special,1,6.74,937
999983,2951,34324,-1,Chiryokumaru,Action,ONA,1,5.40,817
999984,3009,34324,6,Chiryokumaru,Action,ONA,1,5.40,817


## Drop Unnessary Column

In [13]:
raw_merged_data = raw_merged_data.rename(columns={"rating_x": "user_rating"})
raw_merged_data = raw_merged_data.drop('rating_y', axis=1)
raw_merged_data

Unnamed: 0,user_id,anime_id,user_rating,name,genre,type,episodes,members
0,13,1,-1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,486824
1,19,1,10,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,486824
2,21,1,9,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,486824
3,23,1,9,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,486824
4,32,1,10,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,486824
...,...,...,...,...,...,...,...,...
999981,6361,34283,7,Nobunaga no Shinobi Episode 0,"Comedy, Historical",Special,1,937
999982,7114,34283,6,Nobunaga no Shinobi Episode 0,"Comedy, Historical",Special,1,937
999983,2951,34324,-1,Chiryokumaru,Action,ONA,1,817
999984,3009,34324,6,Chiryokumaru,Action,ONA,1,817


## Clean Text

In [14]:
def clean_text(text):
    """
        data preprocessing 
    """
    
    # to lowercase
    text = text.lower()

    # remove sybmols and other words
    text = re.sub(r'<[^>]*>', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'&quot;', '', text)
    text = re.sub(r'.hack//', '', text)
    text = re.sub(r'&#039;', '', text)
    text = re.sub(r'A&#039;s', '', text)
    text = re.sub(r'I&#039;', 'I\'', text)
    text = re.sub(r'&amp;', 'and', text)
  
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # remove number
    #text = re.sub(r'\d+', '', text)

    # tokenization
    #words = word_tokenize(text)

    # remove stopwords
    #stop_words = set(stopwords.words('english'))
    #words = [word for word in words if word not in stop_words]

    # stemming
    #stemmer = PorterStemmer()
    #words = [stemmer.stem(word) for word in words]

    # join words
    #text = ' '.join(words)
    
    return text

In [15]:
start_time = time.time()
raw_merged_data['name']=raw_merged_data['name'].apply(clean_text)
anime_data['name'] = anime_data['name'].apply(clean_text)
end_time = time.time()
elapsed_time = end_time - start_time
print("process time: ", elapsed_time, " sec.")

process time:  7.2423481941223145  sec.


# PART 1: User Based Correlation

## Find rating_anime_matrix

In [16]:
rating_anime_matrix = raw_merged_data.pivot_table(index='user_id', columns=['anime_id'], values = 'user_rating', fill_value= -1)
rating_anime_matrix.head()

anime_id,1,5,6,7,8,15,16,17,18,19,...,34048,34085,34103,34107,34136,34173,34240,34283,34324,34325
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
5,-1,-1,8,-1,-1,6,-1,6,6,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


## Find anime_rating_matrix

In [17]:
anime_rating_matrix = rating_anime_matrix.transpose()
anime_rating_matrix.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,9349,9350,9351,9352,9353,9354,9355,9356,9357,9358
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
5,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
6,-1,-1,-1,-1,8,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
7,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
8,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


## Find the information of target_user_id

In [18]:
anime_rating_user = anime_rating_matrix[target_user_id]
anime_rating_user.sort_values(ascending= False)

anime_id
8074     10
15451    10
11757    10
11617    10
10739    -1
         ..
3135     -1
3132     -1
3131     -1
3130     -1
34325    -1
Name: 1, Length: 8312, dtype: int64

## Find Correlation between target_user_id with all the other users

In [19]:
similar_users = anime_rating_matrix.corrwith(anime_rating_user).dropna()
similar_users

user_id
1       1.000000
2      -0.000241
3       0.100118
5       0.035965
7       0.076957
          ...   
9353    0.120548
9354   -0.000336
9355    0.107090
9356   -0.000538
9357    0.124187
Length: 8857, dtype: float64

## Format into Data Frame

In [20]:
sorted_similar_users = pd.DataFrame(similar_users, columns=['correlation']).sort_values(ascending= False, by= 'correlation')
sorted_similar_users

Unnamed: 0_level_0,correlation
user_id,Unnamed: 1_level_1
1,1.000000
1776,0.865073
4364,0.594646
809,0.581390
1126,0.573915
...,...
7340,-0.006598
6583,-0.007128
6384,-0.007989
7366,-0.008202


## Find the best correlated user

In [21]:
most_similar_user = sorted_similar_users.index.values
most_similar_user = most_similar_user[1]
most_similar_user

1776

## Find 5 best rated anime from the user that have highest correlation value

In [22]:
user_recommendation = raw_merged_data[raw_merged_data['user_id'] == most_similar_user]
user_recommendation = user_recommendation.sort_values(by= 'user_rating', ascending= False)
user_recommendation[:5]

Unnamed: 0,user_id,anime_id,user_rating,name,genre,type,episodes,members
707280,1776,11757,10,sword art online,"Action, Adventure, Fantasy, Game, Romance",TV,25,893100
699454,1776,11617,9,high school dxd,"Comedy, Demons, Ecchi, Harem, Romance, School",TV,12,398660
781253,1776,15451,9,high school dxd new,"Action, Comedy, Demons, Ecchi, Harem, Romance,...",TV,12,266657


In [23]:
user_recommendation_list = user_recommendation['anime_id'].tolist()
user_recommendation_list

[11757, 11617, 15451]

## PART 2: Item Based Correlation

# Use the user's highest rated anime to recommend other related anime

In [39]:
target_user_rating = rating_data.loc[rating_data['user_id'] == target_user_id].sort_values(by = 'rating', ascending= False)
pd.DataFrame(target_user_rating.head(10))

Unnamed: 0,user_id,anime_id,rating
47,1,8074,10
81,1,11617,10
101,1,15451,10
83,1,11757,10
106,1,16706,-1
99,1,15117,-1
100,1,15437,-1
102,1,15583,-1
103,1,15609,-1
104,1,16011,-1


In [42]:
top_rated_anime_id = target_user_rating['anime_id'].tolist()[0]
top_rated_anime_id

8074

In [43]:
ratings = rating_anime_matrix[top_rated_anime_id]
pd.DataFrame(ratings).rename(columns={top_rated_anime_id: f"{top_rated_anime_id}'s rating"})

Unnamed: 0_level_0,8074's rating
user_id,Unnamed: 1_level_1
1,10
2,-1
3,6
4,-1
5,2
...,...
9354,-1
9355,8
9356,-1
9357,10


In [44]:
similar_animes = rating_anime_matrix.corrwith(ratings)
pd.DataFrame(similar_animes, columns=['correlation'])

Unnamed: 0_level_0,correlation
anime_id,Unnamed: 1_level_1
1,0.134215
5,0.089805
6,0.151635
7,0.047559
8,0.011344
...,...
34173,
34240,0.070185
34283,0.025521
34324,0.018979


## Identify the most correlated anime

In [54]:
sorted_similar_anime = pd.DataFrame(similar_animes, columns=['correlation']).sort_values(by= 'correlation', ascending= False)
sorted_similar_anime

Unnamed: 0_level_0,correlation
anime_id,Unnamed: 1_level_1
8074,1.000000
9515,0.493233
11617,0.464429
6547,0.415676
8841,0.403896
...,...
33658,
33659,
33775,
33905,


## Eliminate the source anime

In [55]:
sorted_similar_anime = sorted_similar_anime[1:]
sorted_similar_anime

Unnamed: 0_level_0,correlation
anime_id,Unnamed: 1_level_1
9515,0.493233
11617,0.464429
6547,0.415676
8841,0.403896
11757,0.402759
...,...
33658,
33659,
33775,
33905,


## Ensure the identified anime is popular
Higher number of votes means more popular

In [46]:
rating_votes = pd.DataFrame(raw_merged_data.groupby('anime_id')['user_rating'].count())
rating_votes

Unnamed: 0_level_0,user_rating
anime_id,Unnamed: 1_level_1
1,2115
5,994
6,1487
7,361
8,61
...,...
34173,1
34240,185
34283,4
34324,2


In [47]:
similar_animes_ratings = sorted_similar_anime.join(rating_votes['user_rating']).sort_values(by = 'correlation', ascending = False)
similar_animes_ratings

Unnamed: 0_level_0,correlation,user_rating
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1
9515,0.493233,846
11617,0.464429,1790
6547,0.415676,3181
8841,0.403896,1343
11757,0.402759,3896
...,...,...
33658,,2
33659,,1
33775,,1
33905,,1


## Get the anime that have higher votes and have higher correlation

In [48]:
similar_popular_animes = sorted_similar_anime.loc[similar_animes_ratings['user_rating']>=target_vote_number].dropna()
similar_popular_animes

Unnamed: 0_level_0,correlation
anime_id,Unnamed: 1_level_1
11617,0.464429
6547,0.415676
8841,0.403896
11757,0.402759
6880,0.401713
...,...
523,0.064780
853,0.063893
431,0.058370
120,0.032620


In [49]:
most_similar_popular_animes = similar_popular_animes[:5]
most_similar_popular_animes_list = most_similar_popular_animes.index.to_list()
most_similar_popular_animes_list

[11617, 6547, 8841, 11757, 6880]

# Result Showing

In [50]:
final_recommendation_result = user_recommendation_list.copy()
final_recommendation_result += most_similar_popular_animes_list.copy()
final_recommendation_result

[11757, 11617, 15451, 11617, 6547, 8841, 11757, 6880]

## Show Target User's Rating

In [51]:
target_user = rating_data.loc[rating_data['user_id'] == target_user_id].sort_values(by= 'rating', ascending= False)
pd.DataFrame(target_user.head(10))

Unnamed: 0,user_id,anime_id,rating
47,1,8074,10
81,1,11617,10
101,1,15451,10
83,1,11757,10
106,1,16706,-1
99,1,15117,-1
100,1,15437,-1
102,1,15583,-1
103,1,15609,-1
104,1,16011,-1


## Show Target Anime's information 

In [52]:
target_anime = anime_data.loc[anime_data['anime_id'] == top_rated_anime_id]
pd.DataFrame(target_anime)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
1709,8074,highschool of the dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892


## Show Final Recommendation Result

In [53]:
recommended_anime = anime_data.loc[anime_data['anime_id'].isin(final_recommendation_result)]
pd.DataFrame(recommended_anime)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
159,6547,angel beats,"Action, Comedy, Drama, School, Supernatural",TV,13,8.39,717796
724,15451,high school dxd new,"Action, Comedy, Demons, Ecchi, Harem, Romance,...",TV,12,7.87,266657
804,11757,sword art online,"Action, Adventure, Fantasy, Game, Romance",TV,25,7.83,893100
1057,11617,high school dxd,"Comedy, Demons, Ecchi, Harem, Romance, School",TV,12,7.7,398660
1123,8841,kore wa zombie desu ka,"Action, Comedy, Ecchi, Harem, Magic, Supernatural",TV,12,7.67,295782
1620,6880,deadman wonderland,"Action, Horror, Sci-Fi",TV,12,7.48,453454
