# Preprocessing the Main Dataset

In [1]:
import gzip
import itertools
import re
import shutil
from time import sleep

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Decompress the gzipped archive into a plain-text copy on disk
with gzip.open('movies.txt.gz', 'rb') as compressed, open('movies.txt', 'wb') as extracted:
    shutil.copyfileobj(compressed, extracted)

In [3]:
# Read the extracted file line by line into memory; bytes that cannot be
# decoded are dropped silently because of errors='ignore'
with open('movies.txt', errors='ignore') as review_file:
    lines = list(review_file)

In [None]:
# Preview the first nine raw lines that were read from the file
lines[:9]

In [None]:
# Collect product ID, user ID and score of every review into an (n_reviews, 3) array.
# NOTE(review): like the original `i % 9` logic, this assumes every record spans
# exactly 9 consecutive lines - confirm against the raw file.
LINES_PER_REVIEW = 9
# Column of `array` that receives the value of each field
FIELD_COLUMNS = {
    'product/productId: ': 0,
    'review/userId: ': 1,
    'review/score: ': 2,
}

# dtype=object because the values are strings; `//` keeps the shape an integer.
# Cells that are never filled stay None.
array = np.empty((len(lines) // LINES_PER_REVIEW, 3), dtype=object)
for i, line in tqdm(enumerate(lines), total=len(lines)):
    # Derive the row from the line number so the first review lands in row 0
    # and a field sitting on the first line of a record is not skipped
    index = i // LINES_PER_REVIEW
    if index >= len(array):
        break  # trailing lines that do not form a complete record
    for prefix, column in FIELD_COLUMNS.items():
        if line.startswith(prefix):
            # Strip the field name in front and the newline at the end
            array[index, column] = line[len(prefix):].rstrip('\n')
            break

In [4]:
# Retain only the lines mentioning the product ID, the reviewer ID or the rating
wanted_fields = re.compile(r"product/productId|review/userId|review/score")
clean_reviews = [raw_line for raw_line in lines if wanted_fields.search(raw_line)]

In [None]:
# Preview the first nine lines that survived the field filter
clean_reviews[:9]

In [None]:
# Count how many lines of each of the three kept fields are present
num_prodcts = sum("product/productId" in line for line in clean_reviews)
num_users = sum("review/userId" in line for line in clean_reviews)
num_score = sum("review/score" in line for line in clean_reviews)
# Rough completeness check: every review should contribute exactly one line per field,
# so the three counts must agree. Any code that regroups these lines in fixed triples
# relies on that, hence a mismatch is reported instead of passing silently.
if num_prodcts == num_users == num_score:
    print(num_prodcts, "complete reviews with each three pieces of information")
else:
    print("WARNING: field counts differ - products:", num_prodcts,
          "users:", num_users, "scores:", num_score)

In [5]:
# Strip the field names in front of each value and the "\n" at the end of each line
cat_list = ['product/productId: ', 'review/userId: ', 'review/score: ', '\n']
strip_pattern = re.compile("|".join(cat_list))
final = [strip_pattern.sub("", line) for line in clean_reviews]

In [6]:
# !! Slow on the full dataset
# Regroup the flat list into consecutive slices of three values, one slice per review
chunks = []
for start in range(0, len(final), 3):
    chunks.append(final[start:start + 3])

In [None]:
# Preview the first three grouped reviews
chunks[:3]

In [34]:
# Distinct values found in each of the three positions of a review
all_movies = {review[0] for review in chunks}
all_users = {review[1] for review in chunks}
all_stars = {review[2] for review in chunks}

In [35]:
# Report how many distinct movies, users and scores occur
print(f"Number of different movies reviewed: {len(all_movies)}")
print(f"Number of different users: {len(all_users)}")
print(f"Number of different scores: {len(all_stars)}")

Number of different movies reviewed: 253059
Number of different users: 889176
Number of different scores: 5


In [None]:
# Show the distinct score values (ratings go from 1 to 5)
all_stars

In [8]:
# Soft filter: discard reviews whose score is '1.0' or '2.0'
good_reviews = [review for review in chunks if review[2] not in ('1.0', '2.0')]

In [None]:
# Number of reviews remaining after the soft filter
len(good_reviews) 

In [9]:
# Alternative (hard) filter: additionally discard score '3.0', leaving only the rest
good_reviews_alt = list(filter(lambda review: review[2] != '3.0', good_reviews))
len(good_reviews_alt)

6035359

In [None]:
# !! Similar as above (slow)
# Organising good reviews by user, ineffective method not used.
# Note: despite its name this is a list holding one single-key dict per review
# ({user: [movie, score]}), not one dictionary keyed by user.
review_dict = [{rvw[1]: [rvw[0], rvw[2]]} for rvw in good_reviews_alt]

In [10]:
# Turn the filtered reviews into a dataframe, one row per review
review_columns = ["Movie", "User", "Score"]
df = pd.DataFrame(data=good_reviews_alt, columns=review_columns)

In [None]:
# Preview the first ten rows of the review dataframe
df.head(10)

In [11]:
# Number of times each user appears (= how many reviews that user wrote);
# value_counts() returns the users sorted from most to fewest reviews
num_rvw_users = df['User'].value_counts()

In [None]:
# Total number of rows (reviews) in the dataframe
len(df['User'])

In [39]:
# Number of distinct users, followed by the ten users with the most reviews
print(num_rvw_users.shape[0])
num_rvw_users.head(10)

738205


A16CZRQL23NOIW    10151
A2NJO6YE954DBH     8127
A10ODC971MDHV8     7548
A35ZK3M8L9JUPX     7507
A39CX0EE4BZCZC     6641
ANCOMAI0I7LVG      6585
A328S9RN3U5M68     6085
A3LZGLA88K0LA0     5880
A3UDYY6L2NH3JS     5737
AIMR915K4YCN       5674
Name: User, dtype: int64

In [12]:
# Map every user (key) to that user's number of reviews (value)
dict_user = dict(zip(num_rvw_users.index, num_rvw_users.values))

In [13]:
# Users whose review count is not 1, i.e. reviewers with more than one review
over_one = [user for user, n_reviews in dict_user.items() if n_reviews != 1]
len(over_one)

505797

In [14]:
# Final preprocessed dataframe: good reviews only, restricted to users with more than one review
has_multiple_reviews = df['User'].isin(over_one)
final_df = df[has_multiple_reviews].reset_index(drop=True)

In [None]:
# Summary of the filtered dataframe (columns, dtypes, non-null counts, memory)
final_df.info()

In [36]:
# Saving the dataframe. With the default to_csv settings the row index is written
# as an extra unnamed first column.
final_df.to_csv('PositiveReviews.csv')

In [113]:
# Preview the first rows of the filtered review dataframe
final_df.head()

Unnamed: 0,Movie,User,Score
0,B003AI2VGA,A1I7QGUDP043DG,5.0
1,B00006HAXW,AD4CDZK7D31XP,5.0
2,B00006HAXW,A3Q4S5DFVPB70D,5.0
3,B00006HAXW,A2P7UB02HAVEPB,5.0
4,B00006HAXW,A2TX99AZKDK0V7,4.0


In [146]:
# Number of reviews kept after both filters
len(final_df)

5802951

## Prepare the Dataset with the Genres

In [111]:
# Load the genre labels. Passing `names=` without `header=` makes pandas treat the
# first line of the file as data.
# NOTE(review): assumes labels.csv has no header row - confirm.
genre_df = pd.read_csv("labels.csv", names=['Movie', 'Category'])
genre_df.head()

Unnamed: 0,Movie,Category
0,B0029Z8KCY,"['Movies & TV', 'Boxed Sets', 'Anime']"
1,B00004CQT3,"['Movies & TV', 'Genre for Featured Categories..."
2,B00004CQT4,"['Movies & TV', 'Genre for Featured Categories..."
3,B006JIUN2W,[]
4,B004MPGBHK,[]


In [139]:
# Number of distinct movie IDs in the genre table
len(set(genre_df['Movie']))

253059

In [164]:
# Drop items whose category entry is the literal string '[]' (no category given)
has_category = genre_df['Category'] != '[]'
cleaned_genre = genre_df[has_category].reset_index(drop=True)
cleaned_genre.head()

Unnamed: 0,Movie,Category
0,B0029Z8KCY,"['Movies & TV', 'Boxed Sets', 'Anime']"
1,B00004CQT3,"['Movies & TV', 'Genre for Featured Categories..."
2,B00004CQT4,"['Movies & TV', 'Genre for Featured Categories..."
3,B000009DX2,"['Movies & TV', 'Art House & International']"
4,B0071AD95K,"['Movies & TV', 'Genre for Featured Categories..."


In [165]:
# `labels`: distinct category strings; `all_labels`: category string of every row, in order
category_column = cleaned_genre["Category"]
labels = set(category_column)
all_labels = category_column.tolist()
len(labels)  # not displayed: only the last expression of the cell is shown
all_labels[0]

"['Movies & TV', 'Boxed Sets', 'Anime']"

In [166]:
# Turn every distinct category string into a list of its parts: remove brackets and
# quotes, then split on commas. The parts keep the space that followed each comma.
# The pattern is a raw string: "\[" and "\]" are invalid escape sequences in a normal
# string literal and trigger a warning on recent Python versions.
# NOTE(review): a part that itself contains a comma would be split in two - confirm
# whether such labels occur.
bracket_or_quote = re.compile(r"""[\[\]"']""")
lst = [bracket_or_quote.sub("", label).split(",") for label in labels]
lst[:3]

[['Movies & TV', ' ABC News', ' ABC News Classics'],
 ['Movies & TV', ' Art House & International', ' By Country', ' Sweden'],
 ['Books', ' Arts & Photography']]

In [167]:
# Keep only the category paths whose first part is "Movies & TV"
movies = list(filter(lambda path: path[0] == "Movies & TV", lst))
print(len(movies))

715


In [131]:
# Distinct first parts (top-level categories) over all category paths
all_cat = {path[0] for path in lst}
len(all_cat)  # not displayed: only the printed set below is shown
print(all_cat)

{'Health & Household', 'Movies & TV', 'CDs & Vinyl', 'Tools & Home Improvement', 'Toys & Games', 'Video Games', 'Home & Kitchen', 'Clothing', 'Industrial & Scientific', 'Beauty & Personal Care', 'Office Products', 'Books', 'Automotive', 'Electronics', 'Sports & Outdoors', 'Arts', 'Cell Phones & Accessories', 'Software', 'Musical Instruments', 'Patio', 'Pet Supplies', 'Grocery & Gourmet Food'}


In [123]:
# Add a 'Clean Categories' column holding each row's category string as a list of parts
# (brackets and quotes removed, then split on commas; parts keep their leading space).
# The pattern is a raw string: "\[" and "\]" are invalid escape sequences in a normal
# string literal and trigger a warning on recent Python versions.
bracket_or_quote = re.compile(r"""[\[\]"']""")
clean = [bracket_or_quote.sub("", label) for label in all_labels]
cleaned_genre['Clean Categories'] = [label.split(',') for label in clean]
cleaned_genre.head()

Unnamed: 0,Movie,Clean Categories
0,B0029Z8KCY,"[Movies & TV, Boxed Sets, Anime]"
1,B00004CQT3,"[Movies & TV, Genre for Featured Categories, ..."
2,B00004CQT4,"[Movies & TV, Genre for Featured Categories, ..."
3,B000009DX2,"[Movies & TV, Art House & International]"
4,B0071AD95K,"[Movies & TV, Genre for Featured Categories, ..."


In [129]:
# Inspect the parsed category list of the first row
cleaned_genre['Clean Categories'][0]

['Movies & TV', ' Boxed Sets', ' Anime']

In [136]:
# Keep only the rows whose category list is one of the "Movies & TV" paths
is_movie = cleaned_genre['Clean Categories'].isin(movies)
final_genre = cleaned_genre[is_movie].reset_index(drop=True)
final_genre

Unnamed: 0,Movie,Clean Categories
0,B0029Z8KCY,"[Movies & TV, Boxed Sets, Anime]"
1,B00004CQT3,"[Movies & TV, Genre for Featured Categories, ..."
2,B00004CQT4,"[Movies & TV, Genre for Featured Categories, ..."
3,B000009DX2,"[Movies & TV, Art House & International]"
4,B0071AD95K,"[Movies & TV, Genre for Featured Categories, ..."
...,...,...
207880,B003FZW7VC,"[Movies & TV, Genre for Featured Categories, ..."
207881,B00447L4KA,"[Movies & TV, Studio Specials, Warner Home V..."
207882,B003S1UNZU,"[Movies & TV, Boxed Sets, Documentary]"
207883,B00008G1Y9,"[Movies & TV, Studio Specials, Warner Home V..."


## Towards Merging the Information of both Datasets

In [137]:
# Distinct movie IDs in the review data (df1) and in the genre data (df2); both are sets
lst_movies_df1, lst_movies_df2 = (set(frame['Movie']) for frame in (final_df, final_genre))

In [138]:
# Sizes of the two movie-ID sets, review data first
print(len(lst_movies_df1), len(lst_movies_df2), sep="\n")

213341
207885


In [141]:
# Gap between the two set sizes. This is not the number of reviewed movies that are
# missing from the genre data; that would be the size of the set difference.
len(lst_movies_df1)-len(lst_movies_df2)

5456

In [147]:
# Reviewed movie IDs that do not appear in the genre data
not_movies = list(lst_movies_df1.difference(lst_movies_df2))
len(not_movies)

38399

In [145]:
# Restrict the reviews to movies that also appear in the genre data
has_genre = final_df['Movie'].isin(lst_movies_df2)
clean_final = final_df[has_genre].reset_index(drop=True)
clean_final

Unnamed: 0,Movie,User,Score
0,B00004CQT3,A34KFDQ5KBHZA5,5.0
1,B00004CQT3,A1CIW2OEVAJRM2,5.0
2,B00004CQT3,A1VJCDRXUQVXBM,5.0
3,B00004CQT3,A2IMLPUXYQJTSY,5.0
4,B00004CQT3,A1D12NAC1U12F0,5.0
...,...,...,...
4802661,B000GRUN4A,A3H4EBR4RZIB9Y,5.0
4802662,B003F32YGC,AOI2LGQR941L7,5.0
4802663,B003F32YGC,ACDKM2C99SFCP,5.0
4802664,6304952198,A23KKLV2CD39U8,4.0


In [148]:
# Attach the category list of each movie to its reviews.
# The join key is stated explicitly: without `on=`, merge joins on every column name
# the two frames share, so an extra common column would silently change the result.
complete_df = pd.merge(final_genre, clean_final, on="Movie", how="inner")
complete_df.columns

Index(['Movie', 'Clean Categories', 'User', 'Score'], dtype='object')

In [150]:
# Preview the first five rows of the merged data
complete_df.head(5)

Unnamed: 0,Movie,Clean Categories,User,Score
0,B0029Z8KCY,"[Movies & TV, Boxed Sets, Anime]",AFV2584U13XP3,4.0
1,B00004CQT3,"[Movies & TV, Genre for Featured Categories, ...",A34KFDQ5KBHZA5,5.0
2,B00004CQT3,"[Movies & TV, Genre for Featured Categories, ...",A1CIW2OEVAJRM2,5.0
3,B00004CQT3,"[Movies & TV, Genre for Featured Categories, ...",A1VJCDRXUQVXBM,5.0
4,B00004CQT3,"[Movies & TV, Genre for Featured Categories, ...",A2IMLPUXYQJTSY,5.0


In [151]:
# (rows, columns) of the merged data
complete_df.shape

(4802666, 4)

In [185]:
# Drop the first part of every movie category path, keeping the remaining parts
type_movies = []
for path in movies:
    type_movies.append(path[1:])

[[' ABC News', ' ABC News Classics'],
 [' Art House & International', ' By Country', ' Sweden'],
 [' Independently Distributed', ' Musicals & Performing Arts'],
 [' Jewish Heritage', ' Music & Dance'],
 [' Studio Specials',
  ' Lionsgate Home Entertainment',
  ' Lionsgate DVDs Under $10']]

In [191]:
# Distinct parts occurring anywhere in the remaining category paths
all_paths = set(itertools.chain.from_iterable(type_movies))

{' Men in Black', ' Roger Moore', ' European Cinema', ' Disney Channel Series', ' Patricia Walden', ' Celine', ' Eve', ' Mystery & Thrillers', ' Chinese', ' CBS News', ' Alice', ' Gaiam', ' MGM Movie Time', ' Modern Adaptations', ' French New Wave', ' Arts & Entertainment', ' Harry Potter and the Order of the Phoenix', ' Megadeth', ' The Comedy Central Store', ' TLC', ' Joan of Arc', ' All Disney Titles', ' Pre & Post Natal', ' Anime & Manga', ' Widescreen', ' MGM Home Entertainment', ' Tai Chi', ' Groban', ' Valli', ' Czech', ' Simon', ' Eric', ' Mens Ice Hockey', ' Television', ' Vaughan', ' The Comedies', ' Bruce', ' Sarah', ' Lou', ' TV', ' Godsmack', ' All A&E Titles', ' Special Editions', ' Bon Jovi', ' Science Fiction & Fantasy', ' Celtic Woman', ' Focus Features', ' Religion', ' ABC TV Shows', ' Dune', ' Wall-E', ' FX', ' Docurama', ' By Animator', ' High-Definition DVDs', ' Dream Theater', ' MGM Midnite Movies', ' Sean Connery', ' King David', ' Deportes', ' Crow', ' Monster M

In [192]:
# Save the merged data. With the default to_csv settings the row index is written as an
# unnamed first column, which pandas labels "Unnamed: 0" when the file is read back.
complete_df.to_csv('CompleteDataWithGenres.csv')

In [197]:
# Draw a 1 % random sample of the rows; random_state=1 makes the draw repeatable
small_df = complete_df.sample(frac=0.01, random_state=1)
len(small_df)

48027

In [198]:
# Save the 1 % sample to CSV
small_df.to_csv('1%SampleDataset.csv')

## Reducing the Dataset Size by Genre

In [215]:
# Reload the merged data from disk. The list-valued 'Clean Categories' column comes
# back as plain strings, and the saved row index comes back as an "Unnamed: 0" column.
all_data = pd.read_csv("CompleteDataWithGenres.csv") 

In [11]:
# Preview the first rows of the reloaded data
all_data.head()

Unnamed: 0.1,Unnamed: 0,Movie,Clean Categories,User,Score
0,0,B0029Z8KCY,"['Movies & TV', ' Boxed Sets', ' Anime']",AFV2584U13XP3,4.0
1,1,B00004CQT3,"['Movies & TV', ' Genre for Featured Categorie...",A34KFDQ5KBHZA5,5.0
2,2,B00004CQT3,"['Movies & TV', ' Genre for Featured Categorie...",A1CIW2OEVAJRM2,5.0
3,3,B00004CQT3,"['Movies & TV', ' Genre for Featured Categorie...",A1VJCDRXUQVXBM,5.0
4,4,B00004CQT3,"['Movies & TV', ' Genre for Featured Categorie...",A2IMLPUXYQJTSY,5.0


In [16]:
# Category string of every row, as a plain Python list
the_movies = all_data["Clean Categories"].tolist()

In [17]:
# Preview the first five category strings
the_movies[:5]

["['Movies & TV', ' Boxed Sets', ' Anime']",
 "['Movies & TV', ' Genre for Featured Categories', ' Kids & Family']",
 "['Movies & TV', ' Genre for Featured Categories', ' Kids & Family']",
 "['Movies & TV', ' Genre for Featured Categories', ' Kids & Family']",
 "['Movies & TV', ' Genre for Featured Categories', ' Kids & Family']"]

In [18]:
# Parse every category string back into a list of parts: remove brackets and quotes,
# then split on commas. Parts keep their surrounding spaces at this stage.
# The pattern is a raw string: "\[" and "\]" are invalid escape sequences in a normal
# string literal and trigger a warning on recent Python versions.
bracket_or_quote = re.compile(r"""[\[\]"']""")
ctg = [bracket_or_quote.sub("", ca).split(",") for ca in the_movies]

In [19]:
# Preview the first five parsed category lists
ctg[:5]

[['Movies & TV', '  Boxed Sets', '  Anime'],
 ['Movies & TV', '  Genre for Featured Categories', '  Kids & Family'],
 ['Movies & TV', '  Genre for Featured Categories', '  Kids & Family'],
 ['Movies & TV', '  Genre for Featured Categories', '  Kids & Family'],
 ['Movies & TV', '  Genre for Featured Categories', '  Kids & Family']]

In [20]:
# Drop the first part of every parsed category list
all_clean = list(map(lambda path: path[1:], ctg))

In [32]:
# `final_for_df`: per row, the category parts with surrounding whitespace removed
# `final_all_clean`: the same stripped parts flattened into one list, in row order
final_for_df = [[part.strip() for part in path] for path in all_clean]
final_all_clean = [part for path in final_for_df for part in path]

In [34]:
# Preview the stripped category parts of the first five rows
final_for_df[:5]

[['Boxed Sets', 'Anime'],
 ['Genre for Featured Categories', 'Kids & Family'],
 ['Genre for Featured Categories', 'Kids & Family'],
 ['Genre for Featured Categories', 'Kids & Family'],
 ['Genre for Featured Categories', 'Kids & Family']]

In [26]:
from collections import Counter
# Tally how often each category part occurs over all rows
counts = Counter()
counts.update(final_all_clean)

In [204]:
# The 100 most frequent category parts with their counts
counts.most_common(100)

[('Studio Specials', 1805893),
 ('Genre for Featured Categories', 1317997),
 ('All Titles', 566628),
 ('Warner Home Video', 518428),
 ('Drama', 469854),
 ('Action & Adventure', 440784),
 ('Science Fiction & Fantasy', 393545),
 ('Comedy', 391450),
 ('Science Fiction', 378263),
 ('20th Century Fox Home Entertainment', 363494),
 ('Sony Pictures Home Entertainment', 312043),
 ('All Sony Pictures Titles', 303072),
 ('Universal Studios Home Entertainment', 286539),
 ('All Universal Studios Titles', 286539),
 ('General', 224106),
 ('Paramount Home Entertainment', 201875),
 ('Animation', 198698),
 ('Art House & International', 184574),
 ('Action', 148033),
 ('MGM Home Entertainment', 139420),
 ('All MGM Titles', 137911),
 ('Kids & Family', 131610),
 ('Musicals', 122976),
 ('All Fox Titles', 119921),
 ('Lionsgate Home Entertainment', 112709),
 ('Horror', 112251),
 ('Musicals & Performing Arts', 109435),
 ('Movies', 107696),
 ('By Original Language', 96551),
 ('All Lionsgate Titles', 92090),
 ('

In [217]:
# The wider selection below still gives around 2140243 reviews, which is 44.56 %:
# defined_genres = ['Drama','Action & Adventure', 'Science Fiction & Fantasy', 'Comedy', 'Animation','Horror', 'Documentary', 'Musicals', 'Sports', 'Kids & Family']
# Genres actually kept:
defined_genres = ['Action', 'Horror', 'Documentary', 'Musicals', 'Anime & Manga']
# Character length of every genre name
length = list(map(len, defined_genres))

In [177]:
# Sort the name lengths in place, then show the distinct values
length[:] = sorted(length)
set(length)

{5, 6, 8, 9, 11, 18}

In [218]:
import random

# Assign exactly one genre from `defined_genres` to every row; rows whose categories
# contain none of them get the placeholder string "NaN".
genre_rng = random.Random(1)  # fixed seed so the tie-breaking below is repeatable
wanted_genres = set(defined_genres)
x = []
for categories in tqdm(final_for_df):
    matches = wanted_genres.intersection(categories)
    if not matches:
        x.append("NaN")
    elif len(matches) == 1:
        x.append(next(iter(matches)))
    else:
        # Several genres apply: pick one at random. random.sample() no longer accepts
        # a set on recent Python versions, so choose from a sorted list instead;
        # sorting also makes the draw independent of set iteration order.
        x.append(genre_rng.choice(sorted(matches)))

100%|████████████████████████████████████████████████████████████████████| 4802666/4802666 [00:14<00:00, 339158.27it/s]


In [147]:
# Preview the genre assigned to the first five rows
x[:5]

['NaN', 'Kids & Family', 'Kids & Family', 'Kids & Family', 'Kids & Family']

In [179]:
# Distinct string lengths among the assigned genre values
xl = set(map(len, x))
xl

{3, 5, 6, 8, 9, 11, 18}

In [219]:
# Overwrite the category column with the single assigned genre and call it 'MovieGenre'
all_data['Clean Categories'] = x
all_data = all_data.rename(columns={'Clean Categories': 'MovieGenre'})

In [220]:
# Keep only rows that received a genre and remove the stale "Unnamed: 0" column.
# The result of drop() is assigned so the column really leaves `all_data` instead of
# only disappearing from the displayed copy; errors="ignore" keeps the cell re-runnable
# once the column is gone.
all_data = (
    all_data[all_data.MovieGenre != "NaN"]
    .drop(columns=["Unnamed: 0"], errors="ignore")
    .reset_index(drop=True)
)
all_data

Unnamed: 0,Movie,MovieGenre,User,Score
0,B000NDFLWG,Documentary,A3GYVMUS7SSDFW,5.0
1,B000NDFLWG,Documentary,AJYGQV81FSFE2,4.0
2,B000NDFLWG,Documentary,A39F5HLWQEZD7E,5.0
3,B000NDFLWG,Documentary,A21SBJ6QU1C4IX,5.0
4,B000NDFLWG,Documentary,A1TK6WNUIAEQRU,4.0
...,...,...,...,...
468739,B003S1UNZU,Documentary,A3KP81PWE0JH5E,5.0
468740,B003S1UNZU,Documentary,A3AD071UEP20WS,5.0
468741,B003S1UNZU,Documentary,A3IPZPME7QYQS6,5.0
468742,B003S1UNZU,Documentary,A1PNWMXODZ049K,5.0


In [221]:
# Save the genre-filtered data (roughly 10 % of the merged rows in the recorded run).
# With the default to_csv settings the row index is written as an unnamed first column.
all_data.to_csv('10%Dataset.csv')

## Dictionary with Movie as Key and Genre as Value

In [20]:
# Reload the genre-filtered data from disk
data_for_clusters =  pd.read_csv("10%Dataset.csv") 

In [21]:
# Only the movie ID and its genre are needed from here on
data_for_clusters = data_for_clusters.loc[:, ["Movie", "MovieGenre"]]
data_for_clusters.head(10)

Unnamed: 0,Movie,MovieGenre
0,B000NDFLWG,Documentary
1,B000NDFLWG,Documentary
2,B000NDFLWG,Documentary
3,B000NDFLWG,Documentary
4,B000NDFLWG,Documentary
5,B000NDFLWG,Documentary
6,B000NDFLWG,Documentary
7,B000NDFLWG,Documentary
8,B000NDFLWG,Documentary
9,B000NDFLWG,Documentary


In [22]:
# Collapse to one row per distinct (Movie, MovieGenre) pair
dataclusters = data_for_clusters.drop_duplicates().reset_index(drop=True)
dataclusters.head(10)

Unnamed: 0,Movie,MovieGenre
0,B000NDFLWG,Documentary
1,B008FPU7AA,Horror
2,1562229567,Documentary
3,1888617047,Musicals
4,6305508569,Action
5,6304474415,Documentary
6,B0006FFRD4,Anime & Manga
7,B003BUAP10,Documentary
8,B000BB1NFO,Documentary
9,B00020HBN2,Anime & Manga


In [23]:
# Row counts before and after removing duplicate (Movie, MovieGenre) pairs
print(len(data_for_clusters), len(dataclusters), sep="\n")

468744
21788


In [24]:
# Movie IDs become the dictionary keys, their genres the values
keys, values = dataclusters["Movie"], dataclusters["MovieGenre"]

In [25]:
# Map every movie ID to its genre. If a movie occurs with more than one genre in
# `dataclusters` (possible because ties were broken at random per review row),
# the last occurrence wins.
dict_mvs_genre = dict(zip(keys, values))