In [1]:
import numpy as np
import pandas as pd
import random

# Read roughly 500000 records from the merged IMDb data (FinalMergedIMDBMovieData-4Apr.csv) and pick 50000 unique movies from the list

In [2]:
# FinalMergedIMDBMovieData-4Apr.csv contains data in the following format -
#Unnamed: 0	tconst	titleType	title	originalTitle	isAdult	startYear	endYear	runtimeMinutes	genres	averageRating	numVotes	ordering	nconst	category	job	characters
#0	0	tt0000001	short	Carmencita	Carmencita	0	1894	\N	1	Documentary,Short	5.7	1966.0	1.0	nm1588970	self	\N	["Self"]
# FinalMergedIMDBMovieData-4Apr.csv is the output of imdb_ED_script.ipynb

chunk_size = 50000
total_req_size = 550000

# Read the CSV chunk by chunk and keep whole chunks instead of appending one
# row at a time: iterating rows in Python is very slow and turns every column
# into a generic object column when the frame is rebuilt.
i = 0  # number of rows read so far
movie_list = []  # chunks (DataFrames) collected so far
for chunk in pd.read_csv('FinalMergedIMDBMovieData-4Apr.csv', low_memory=False, chunksize=chunk_size):
    movie_list.append(chunk)
    i += len(chunk)
    # Stop as soon as the requested number of rows has been reached
    # (`>` would read one extra chunk when the target is a multiple of chunk_size).
    if i >= total_req_size:
        break

# Concatenating without ignore_index keeps the original row labels of the file.
imdb_df = pd.concat(movie_list) if movie_list else pd.DataFrame()


In [3]:
# Number of unique movies
# imdb_df = imdb_df.drop(columns=['Unnamed: 0'])
print(imdb_df.title.nunique())
imdb_df.head()


540


Unnamed: 0.1,Unnamed: 0,tconst,titleType,title,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,ordering,nconst,category,job,characters
0,0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",5.7,1966.0,1.0,nm1588970,self,\N,"[""Self""]"
1,1,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",5.7,1966.0,2.0,nm0005690,director,\N,\N
2,2,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",5.7,1966.0,3.0,nm0374658,cinematographer,director of photography,\N
3,3,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short",5.8,263.0,1.0,nm0721526,director,\N,\N
4,4,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short",5.8,263.0,2.0,nm1335271,composer,\N,\N


In [None]:
imdb_df.tconst.nunique()

In [None]:
imdb_df.to_csv('FinalMergedIMDBMovieData-4Apr_Subset.csv', index=False)

In [None]:
# For every title, count the distinct tconst ids and collect them, so titles shared by
# several tconst ids can be found. The result has MultiIndex columns such as ('tconst', 'nunique').
t_group = imdb_df.groupby('title').agg({'tconst':['nunique', 'unique']}).reset_index()

In [None]:
t_group.head()

In [None]:
# Titles that map to more than one tconst id.
temp = t_group[t_group[('tconst', 'nunique')]>1]
# A bare expression in the middle of a cell is not displayed, so print the count explicitly.
print(len(temp))
temp.to_csv('title_with_more_than_one_id.csv')

# Generate user IDs - 200001 sequential IDs (ID100000 to ID300000)

In [None]:
# Build the sequential user ids 'ID100000' ... 'ID300000' (both ends included).
user_list = ['ID' + str(user_number) for user_number in range(100000, 300001)]

In [None]:
(user_list)[3123]


# Generate random user-movie interaction data 

In [None]:
title_list = list(imdb_df.title.unique())
len(title_list) * 0.05


In [None]:

random_seed = 414
max_data_size = 3000

In [None]:
# Users with no watch counts and movies with no watch counts - 10% of users, 5% of movies.
# Slice bounds must be integers, so the fractional counts are truncated with int();
# slicing with the raw float result raises a TypeError.
no_watch_user_count = int(len(user_list) * 0.1)
no_watch_movie_count = int(len(title_list) * 0.05)
user_list_null = user_list[:no_watch_user_count]
title_list_null = title_list[:no_watch_movie_count]

# NOTE: user_list / title_list are rebound to the remainder, so running this cell
# a second time without rebuilding the lists shrinks them again.
user_list = user_list[no_watch_user_count:]
title_list = title_list[no_watch_movie_count:]

print(len(user_list))
print(len(user_list_null))
print(len(title_list))
print(len(title_list_null))


# 1. Selecting a particular user and populating the movies for the user

In [None]:
# 1. Pick random users from user_list. The number of users is controlled by the user_list_size variable below.
# 2. Set a max limit for the number of movies a user can watch. This is controlled by the max_data_size variable.
# 3. For each picked user, generate a random set of watched movies; vary user_list_size and max_data_size to get different data sets.

In [None]:
# 1. Pick random users from user_list (drawn with replacement, so a user can be picked more than once)
# 2. Set max_data_size
random_seed = 675
max_data_size = 1600
user_list_size = 3000

# Seed NumPy's global generator as well: random_seed was previously only applied to
# Python's `random` module, so the np.random draws changed on every run.
np.random.seed(random_seed)
random_indices = list(np.random.randint(low=0, high=len(user_list), size=user_list_size))
user_list_temp = [user_list[idx] for idx in random_indices]
len(user_list_temp)

In [None]:
# Seed both generators used below so the generated interactions are reproducible.
random.seed(random_seed)
np.random.seed(random_seed)
interaction_columns = ['user_id', 'title', 'rating', 'interest']
total_size = 0
user_frames = []  # one DataFrame per sampled user, concatenated once after the loop

# 3. For each sampled user, draw a random number of random titles with a rating and an interest score
for user in user_list_temp:
    data_size = random.randint(1, max_data_size)
    total_size += data_size
    user_frames.append(pd.DataFrame({
        'user_id': [user] * data_size,  # same user id on every generated row
        'title': np.random.choice(title_list, size=data_size),
        # NOTE(review): randint's `high` bound is exclusive, so ratings fall in 1..4 -
        # confirm whether a rating of 5 is meant to be possible.
        'rating': np.random.randint(low=1, high=5, size=data_size),
        'interest': np.random.rand(data_size),
    }, columns=interaction_columns))

# Concatenate once instead of growing final_df inside the loop (which re-copies all
# earlier rows on every iteration). The list is reversed to keep the previous
# newest-user-first row order.
if user_frames:
    final_df = pd.concat(user_frames[::-1], ignore_index=True, axis=0)
else:
    final_df = pd.DataFrame(columns=interaction_columns)
print('Total rows generated - ', str(total_size))


In [None]:
# Report how many rows the generation loop produced.
print('Total rows generated - ', str(total_size))
# write each chunk of generated data to a csv.
# The DataFrame index is written as well (index=False is not passed), so the file
# gets an extra unnamed leading column.
final_df.to_csv('imdb_generated_data-7.csv')

# Run the above cells again as required, changing the following params to get different results -
# random_seed = 675
# max_data_size = 1600
# user_list_size = 3000

In [None]:
final_df.columns

In [None]:
print(final_df.user_id.nunique())
final_df.title.nunique()

# 2. Selecting a particular movie and populating the users for the movie

In [None]:
# Pick random titles
# TODO: Vary the `low` param of np.random.randint() -
# 1. Start by keeping the low and high param values very close. Keep decreasing the low param value and iterate for multiple steps.
# Do the same for the low and high params of data_size = random.randint(max_data_size-2000, max_data_size) in the data generation cell further below.

random_seed = 890
max_data_size = 6000
title_list_size = 200

# Seed NumPy's global generator as well: random_seed was previously only applied to
# Python's `random` module, so the np.random draws changed on every run.
np.random.seed(random_seed)
# Indices are drawn with replacement from the last 90% of title_list (the first tenth is skipped).
random_indices = list(np.random.randint(low=int(len(title_list)/10), high=len(title_list), size=title_list_size))
title_list_temp = [title_list[idx] for idx in random_indices]
len(title_list_temp)

In [None]:
# Experimental -
# Generate lots of movies (1000-1200 movies) with high user view counts (user view counts > 4500)


In [None]:
#### Experimental - Increasing user counts of movies for large viewership
# Current status - Due to the large number of movies, user counts for movies are concentrated on the lower side,
# i.e., movies have a low user count (1-3000 users per movie).
# Trying to increase the user counts into the 4500-5500 range.
# Load the previously merged generated data so the per-title user counts can be inspected.
df = pd.read_csv('imdb_subset_generated_data_merged.csv', low_memory=False)


In [None]:
title_counts = df.groupby('title').agg({'user_id':['count','nunique']}).reset_index().sort_values(('user_id', 'nunique'))

In [None]:
high_title_list = title_counts[title_counts[('user_id', 'nunique')]>=4500]
high_title_list

In [None]:
# Pick random titles
# TODO: Vary the `low` param of np.random.randint() -
# 1. Start by keeping the low and high param values very close. Keep decreasing the low param value and iterate for multiple steps.
# Do the same for the low and high params of data_size = random.randint(max_data_size-2000, max_data_size) in the data generation cell below.

random_seed = 890
max_data_size = 6000
title_list_size = 200

# Seed NumPy's global generator as well: random_seed was previously only applied to
# Python's `random` module, so the np.random draws changed on every run.
np.random.seed(random_seed)
# Indices are drawn with replacement from the last 90% of title_list (the first tenth is skipped).
random_indices = list(np.random.randint(low=int(len(title_list)/10), high=len(title_list), size=title_list_size))
title_list_temp = [title_list[idx] for idx in random_indices]
len(title_list_temp)

In [None]:
# Seed both generators used below so the generated interactions are reproducible.
random.seed(random_seed)
np.random.seed(random_seed)
interaction_columns = ['user_id', 'title', 'rating', 'interest']
final_df = pd.DataFrame(columns=interaction_columns)

total_size = 0
print(final_df.shape)
title_frames = []  # one DataFrame per sampled title, concatenated once after the loop

# 1. For each sampled title, generate a large random audience (between max_data_size-1000 and max_data_size rows)
for title in title_list_temp:
    data_size = random.randint(max_data_size-1000, max_data_size)
    total_size += data_size
    print('Generated ', str(data_size), ' new data points for title - ', str(title))
    title_frames.append(pd.DataFrame({
        # Users are drawn with replacement, so the same user can appear more than once for a title.
        'user_id': np.random.choice(user_list, size=data_size),
        'title': [title] * data_size,
        # NOTE(review): randint's `high` bound is exclusive, so ratings fall in 1..4 -
        # confirm whether a rating of 5 is meant to be possible.
        'rating': np.random.randint(low=1, high=5, size=data_size),
        'interest': np.random.rand(data_size),
    }, columns=interaction_columns))

# Concatenate once instead of growing final_df inside the loop (which re-copies all
# earlier rows on every iteration). The list is reversed to keep the previous
# newest-title-first row order.
if title_frames:
    final_df = pd.concat(title_frames[::-1], ignore_index=True, axis=0)
print('Total rows generated - ', str(total_size))


In [None]:
# Persist the title-driven chunk. The DataFrame index is written as well (index=False is not passed).
final_df.to_csv('imdb_generated_data-9_high_user_count.csv')
# Run the above cells again as required, changing the following params to get different results -
# random_seed = 675
# max_data_size = 1600
# title_list_size = 3000
# NOTE(review): these example values differ from the ones set in the title-selection
# cell (890 / 6000 / 200) - confirm which are current.

In [None]:
print(final_df.user_id.nunique())
final_df.title.nunique()

In [None]:
final_df.shape

# Data validation - Check whether the generated data follows any kind of uniform distribution; if so, delete that chunk of data

In [None]:
# All generated chunks that make up the final data set, in merge order.
generated_files = [
    'imdb_generated_data-1.csv',
    'imdb_generated_data-2.csv',
    'imdb_generated_data-3.csv',
    'imdb_generated_data-4.csv',
    'imdb_generated_data-5.csv',
    'imdb_generated_data-6.csv',
    'imdb_generated_data-7_9357_unique_titles.csv',
    'imdb_generated_data-8_high_user_count.csv',
    'imdb_generated_data-9_high_user_count.csv',
]

# Read the chunks in a loop instead of nine copy-pasted lines; the individual frames
# are not bound to names, so they can be freed once the merged frame exists.
# Row labels are kept as read (no ignore_index), exactly as before.
df = pd.concat([pd.read_csv(path) for path in generated_files])

In [None]:
df.to_csv('generated_data/imdb_subset_generated_data_merged_final.csv', index=False)

In [None]:
df.shape

In [None]:
print(df.user_id.nunique())
df.title.nunique()

In [None]:
# Per-title watch statistics (row count and distinct-user count), saved to CSV.
# NOTE(review): this aggregates final_df (the most recently generated chunk), not the
# merged df built above - confirm that this is intended.
title_counts = final_df.groupby('title').agg({'user_id':['count','nunique']}).reset_index()
title_counts.to_csv('title_watch_counts_imdb_high_user_count_2.csv', index=False)

In [None]:
# Per-user watch statistics (row count and distinct-title count), saved to CSV.
# NOTE(review): this aggregates final_df (the most recently generated chunk), not the
# merged df built above - confirm that this is intended.
user_counts = final_df.groupby('user_id').agg({'title':['count','nunique']}).reset_index()
user_counts.to_csv('user_watch_counts_imdb_high_user_count_2.csv', index=False)

# Merge the additional movie info to all records

In [None]:
df = pd.read_csv('generated_data/imdb_subset_generated_data_merged_final.csv')
print(df.columns)
# imdb_df.columns

In [None]:
print(df.shape)
imdb_df.title.nunique()

In [None]:
df.title.nunique()

In [None]:
imdb_df['crew_job'] = None
imdb_df['character_name'] = None
imdb_df.head(1)

In [None]:
# Keep only the IMDb rows whose title appears in the generated interaction data (df).
# imdb_df is overwritten, so the unfiltered frame is no longer available after this cell.
imdb_df = imdb_df[imdb_df['title'].isin(df.title.unique())]

In [None]:
# print(merged_df.shape)
imdb_df.shape

In [None]:
# Collate all crew info into a single row per movie title, in the format -
# crew_job -> [{'nconst1': 'category'}, {'nconst12': 'category'}, ...]
# character names -> [{'nconst1': 'char_name'}, {'nconst12': 'char_name'}, ...]
# The per-crew columns (ordering, nconst, category, job, characters) and tconst are not carried over.

title_group = imdb_df.groupby('title')
crew_job = []
crew_name = []
title = []
title_type = []
start_year = []
end_year = []
runtime = []
genres = []
avg_rating = []
num_votes = []

for key, group in title_group:
    # One {nconst: value} dict per crew row; rows holding IMDb's '\N' placeholder are skipped.
    c_job = [
        {nconst: category}
        for nconst, category in zip(group['nconst'], group['category'])
        if category != '\\N'
    ]
    c_name = [
        {nconst: characters}
        for nconst, characters in zip(group['nconst'], group['characters'])
        if characters != '\\N'
    ]
    crew_job.append(c_job)
    crew_name.append(c_name)

    # Movie-level fields are taken from the first row of the group.
    title.append(group['title'].iloc[0])
    title_type.append(group['titleType'].iloc[0])
    start_year.append(group['startYear'].iloc[0])
    end_year.append(group['endYear'].iloc[0])
    runtime.append(group['runtimeMinutes'].iloc[0])
    genres.append(group['genres'].iloc[0])
    avg_rating.append(group['averageRating'].iloc[0])
    num_votes.append(group['numVotes'].iloc[0])

# Build the frame once, after the loop: rebuilding it on every iteration is quadratic
# and leaves imdb_df_new undefined when there are no groups at all.
# NOTE: the 'charachter_names' spelling is kept as-is because it becomes a column of the exported data.
imdb_df_new = pd.DataFrame(data={'title': title, 'title_type': title_type, 'start_year': start_year, 'end_year': end_year,
                                 'runtime_min': runtime, 'genres': genres, 'avg_rating': avg_rating, 'num_votes': num_votes,
                                 'crew_job': crew_job, 'charachter_names': crew_name})
    
#     imdb_df[imdb_df['title']==key,'crew_job'] = crew_job
#     imdb_df[imdb_df['title']==key,'character_name'] = crew_name


In [None]:
print(imdb_df_new.shape)
imdb_df.shape

In [None]:
imdb_df_new.tail()

In [None]:
temp_df = pd.DataFrame(data={'crew_job':crew_job, 'character_name':crew_name})

In [None]:
print(temp_df.shape)
temp_df.head()

In [None]:
print(df.shape)
imdb_df_new.shape

In [None]:
merged_imdb_df = pd.merge(df, imdb_df_new, how='left', on='title')

In [None]:
# Drop the stray index column that comes from CSVs written with their index.
# errors='ignore' keeps the cell re-runnable: without it a second run (or input
# files that carry no such column) raises a KeyError.
merged_imdb_df = merged_imdb_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
print(merged_imdb_df.shape)
merged_imdb_df.head()

In [None]:
akas_df = pd.read_csv('title.akas.tsv/data.tsv', sep='\t', low_memory=False)

In [None]:
akas_df = akas_df[akas_df['title'].isin(merged_imdb_df.title.unique())]

In [None]:
print(akas_df.shape)
print(akas_df.title.nunique())
akas_df.head()

In [None]:
# Replace IMDb's '\N' placeholder with a randomly chosen real language code.
# The candidate list is built once (not once per row) and excludes '\N' and NaN,
# so a placeholder is never "filled" with another placeholder.
known_languages = [lang for lang in akas_df['language'].dropna().unique() if lang != '\\N']
if known_languages:
    akas_df = akas_df.assign(
        language=akas_df['language'].map(
            lambda lang: random.choice(known_languages) if lang == '\\N' else lang
        )
    )

In [None]:
akas_df.head()

In [None]:
# akas_df['language'] = akas_df['language'].fillna(random.choice(akas_df['language'].unique()))

In [None]:
# Fill truly missing languages (NaN / None) with a random known language code.
# `== None` never matches NaN, so the previous row-by-row loop changed nothing;
# missing values are detected with isna() and filled in one assignment.
missing_language = akas_df['language'].isna()
known_languages = [lang for lang in akas_df['language'].dropna().unique() if lang != '\\N']
if known_languages and missing_language.any():
    akas_df.loc[missing_language, 'language'] = [
        random.choice(known_languages) for _ in range(int(missing_language.sum()))
    ]

In [None]:
akas_df.head()

In [None]:
akas_df.language.value_counts()

In [None]:
akas_df.language.value_counts()

In [None]:
# Keep one language per title: the language on the first akas row of each title,
# with titles in sorted order (the order groupby('title') iterates in).
title_group = akas_df.groupby('title')
first_rows = (
    akas_df.dropna(subset=['title'])                  # groupby ignores rows without a title
    .drop_duplicates(subset='title', keep='first')   # first row of every title
    .sort_values('title')
)
# `!= None` is true for NaN as well, so it filtered nothing; use notna() to skip
# titles whose first row has no language.
first_rows = first_rows[first_rows['language'].notna()]

title_list = first_rows['title'].tolist()
language_list = first_rows['language'].tolist()
akas_new_df = pd.DataFrame(data={'title': title_list, 'language': language_list})

In [None]:
akas_new_df.shape

In [None]:
akas_new_df.title.nunique()

In [None]:
akas_new_df.head()

In [None]:
merged_imdb_df_new = pd.merge(merged_imdb_df, akas_new_df, how='left', on='title')

In [None]:
merged_imdb_df_new.shape

In [None]:
merged_imdb_df_new.title.nunique()

In [None]:
merged_imdb_df_new.head()

In [None]:
merged_imdb_df_new.to_csv('generated_data/imdb_subset_generated_data_merged_with_lang_genre.csv', index=False)

In [None]:
print('titles - ', merged_imdb_df_new.title.nunique())
print('users - ', merged_imdb_df_new.user_id.nunique())
print('languages - ', merged_imdb_df_new.language.nunique())
print('Genre - ', merged_imdb_df_new.genres.nunique())

In [None]:
merged_imdb_df_new.language.value_counts()

In [None]:
lan_list = ['en', 'fr', 'hi', 'uk', 'he']
temp_df = merged_imdb_df_new[merged_imdb_df_new['language'].isin(lan_list)]
temp_df.shape

In [None]:
print(temp_df.title.nunique())
print(temp_df.user_id.nunique())
temp_df.genres.nunique()

In [None]:
temp_df.info()

In [None]:
temp_df.to_csv('demo_data.csv', index=False)

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 split of the demo data. random_state pins the shuffle so the same rows land
# in train/test on every run (random_seed is set in the configuration cells above).
train, test = train_test_split(temp_df, test_size=0.3, random_state=random_seed)

In [None]:
test.info()

In [None]:
# Save the training split without the index column.
# NOTE(review): the `test` split is not written anywhere in the visible cells -
# confirm whether it should be saved as well.
train.to_csv('demo_data_train.csv', index=False)