In [1]:
import re
import gzip
import shutil
import pandas as pd

In [None]:
# Unpack the gzipped dump into a plain file; copyfileobj streams the data
# across in chunks rather than loading the whole archive at once.
with gzip.open('movies.txt.gz', 'rb') as compressed, open('movies.txt', 'wb') as plain:
    shutil.copyfileobj(compressed, plain)

In [2]:
# Read every line of the decompressed dump into a list (newlines are kept).
# errors='ignore' silently drops bytes that cannot be decoded; errors='replace'
# would insert a placeholder character instead.
# NOTE(review): no encoding is passed, so Python's platform default is used -
# confirm the dump's real encoding before relying on the text fields.
with open('movies.txt', errors='ignore') as dump:
    lines = list(dump)

In [3]:
# Show the first nine raw lines to inspect how one record is laid out.
lines[:9]

['product/productId: B003AI2VGA\n',
 'review/userId: A141HP4LYPWMSR\n',
 'review/profileName: Brian E. Erland "Rainbow Sphinx"\n',
 'review/helpfulness: 7/7\n',
 'review/score: 3.0\n',
 'review/time: 1182729600\n',
 'review/summary: "There Is So Much Darkness Now ~ Come For The Miracle"\n',
 'review/text: Synopsis: On the daily trek from Juarez, Mexico to El Paso, Texas an ever increasing number of female workers are found raped and murdered in the surrounding desert. Investigative reporter Karina Danes (Minnie Driver) arrives from Los Angeles to pursue the story and angers both the local police and the factory owners who employee the undocumented aliens with her pointed questions and relentless quest for the truth.<br /><br />Her story goes nationwide when a young girl named Mariela (Ana Claudia Talancon) survives a vicious attack and walks out of the desert crediting the Blessed Virgin for her rescue. Her story is further enhanced when the "Wounds of Christ" (stigmata) appear in her 

In [5]:
# Keep only the three fields needed downstream: product id, user id and score.
# The tag has to be at the start of the line. An unanchored search would also
# keep free-text lines (e.g. a review body) that merely mention one of the
# tags, and a single such line would shift every following 3-line record.
wanted_tags = ("product/productId", "review/userId", "review/score")
clean_reviews = [line for line in lines if line.startswith(wanted_tags)]

In [6]:
# Show the first nine kept lines (three lines per review).
clean_reviews[:9]

['product/productId: B003AI2VGA\n',
 'review/userId: A141HP4LYPWMSR\n',
 'review/score: 3.0\n',
 'product/productId: B003AI2VGA\n',
 'review/userId: A328S9RN3U5M68\n',
 'review/score: 3.0\n',
 'product/productId: B003AI2VGA\n',
 'review/userId: A1I7QGUDP043DG\n',
 'review/score: 5.0\n']

In [12]:
# Count how many of the kept lines carry each of the three field tags.
num_prodcts = sum(1 for entry in clean_reviews if "product/productId" in entry)
num_users = sum(1 for entry in clean_reviews if "review/userId" in entry)
num_score = sum(1 for entry in clean_reviews if "review/score" in entry)

In [14]:
# Every review must contribute exactly one productId, one userId and one score
# line, because the kept lines are later grouped into consecutive triples.
# Equal counts are a necessary (not sufficient) condition for that, so stop
# loudly on a mismatch instead of silently printing nothing and carrying on.
if num_prodcts == num_users == num_score:
    print(num_prodcts, "complete reviews with each three pieces of information")
else:
    raise ValueError(
        "Field counts differ: "
        f"{num_prodcts} productId, {num_users} userId, {num_score} score lines"
    )

7911684 reviews with each three pieces of information


In [16]:
# Strip the field tags and the trailing newline so only the bare values remain.
cat_list = ['product/productId: ', 'review/userId: ', 'review/score: ', '\n']
# Build and compile the alternation once instead of re-joining it for every
# line; re.escape guarantees the tags are matched literally.
strip_pattern = re.compile("|".join(re.escape(tag) for tag in cat_list))
final = [strip_pattern.sub("", lne) for lne in clean_reviews]

In [19]:
# Group the flat list of values into consecutive slices of three
# (product id, user id, score - the order in which the lines were kept).
# !! Slow on the full data set.
record_width = 3
chunks = [final[start:start + record_width] for start in range(0, len(final), record_width)]

In [20]:
# Show the first three grouped records.
chunks[:3]

[['B003AI2VGA', 'A141HP4LYPWMSR', '3.0'],
 ['B003AI2VGA', 'A328S9RN3U5M68', '3.0'],
 ['B003AI2VGA', 'A1I7QGUDP043DG', '5.0']]

In [21]:
# Distinct values seen in each position of the records.
all_movies = {record[0] for record in chunks}
all_users = {record[1] for record in chunks}
all_stars = {record[2] for record in chunks}

In [22]:
# Report how many distinct movies, users and score values were found.
for label, distinct in (
    ("Number of different movies reviewed:", all_movies),
    ("Number of different users:", all_users),
    ("Number of different scores:", all_stars),
):
    print(label, len(distinct))

Number of different movies reviewed: 253059
Number of different users: 889176
Number of different scores: 5


In [23]:
# Display the distinct score values (they are still strings at this point).
all_stars

{'1.0', '2.0', '3.0', '4.0', '5.0'}

In [24]:
# Discard the 1- and 2-star reviews; scores are compared as strings.
good_reviews = [rvw for rvw in chunks if rvw[2] not in ('1.0', '2.0')]

In [26]:
# Number of reviews left after dropping 1- and 2-star scores.
len(good_reviews)

6826953

In [27]:
# Stricter variant: also drop the 3-star reviews, then show how many remain.
good_reviews_alt = list(filter(lambda rvw: rvw[2] != '3.0', good_reviews))
len(good_reviews_alt)

6035359

In [29]:
# One single-entry dict per review, shaped {user: [movie, score]}.
# Despite the name this is a list of dicts, not a dict.
# !! Slow on the full data set, like the chunking step.
review_dict = [{user: [movie, score]} for movie, user, score in good_reviews_alt]

In [36]:
# Tabulate the remaining reviews; column order follows the record layout.
column_names = ["Movie", "User", "Score"]
df = pd.DataFrame(data=good_reviews_alt, columns=column_names)

In [39]:
# Show the first ten rows of the review table.
df.head(10)

Unnamed: 0,Movie,User,Score
0,B003AI2VGA,A1I7QGUDP043DG,5.0
1,B00006HAXW,AD4CDZK7D31XP,5.0
2,B00006HAXW,A3Q4S5DFVPB70D,5.0
3,B00006HAXW,A2P7UB02HAVEPB,5.0
4,B00006HAXW,A2TX99AZKDK0V7,4.0
5,B00006HAXW,AFC8IKR407HSK,5.0
6,B00006HAXW,A1FRPGQYQTAOR1,5.0
7,B00006HAXW,A1RSDE90N6RSZF,5.0
8,B00006HAXW,A1OUBOGB5970AO,4.0
9,B00006HAXW,A3NPHQVIY59Y0Y,5.0


In [40]:
# Number of rows (reviews) in df for each user id.
num_rvw_users = df['User'].value_counts()

In [62]:
# Inspect the per-user counts: the object's type, the number of distinct
# users, and the first ten entries.
print(type(num_rvw_users))
print(len(num_rvw_users))
num_rvw_users[:10]

<class 'pandas.core.series.Series'>
738205


A16CZRQL23NOIW    10151
A2NJO6YE954DBH     8127
A10ODC971MDHV8     7548
A35ZK3M8L9JUPX     7507
A39CX0EE4BZCZC     6641
ANCOMAI0I7LVG      6585
A328S9RN3U5M68     6085
A3LZGLA88K0LA0     5880
A3UDYY6L2NH3JS     5737
AIMR915K4YCN       5674
Name: User, dtype: int64

In [63]:
# Convert the per-user counts into a plain dict: user id -> review count.
dict_user = dict(num_rvw_users)

In [65]:
# Users whose review count is not exactly one, followed by how many there are.
over_one = [user for user, n_reviews in dict_user.items() if n_reviews != 1]
len(over_one)

505797

In [66]:
# Keep only rows written by users in over_one and renumber the index from 0.
has_multiple_reviews = df['User'].isin(over_one)
final_df = df.loc[has_multiple_reviews].reset_index(drop=True)

In [67]:
# Summarise the filtered frame: row count, column dtypes and memory usage.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5802951 entries, 0 to 5802950
Data columns (total 3 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   Movie   object
 1   User    object
 2   Score   object
dtypes: object(3)
memory usage: 132.8+ MB


In [68]:
# Show the first ten rows of the filtered frame.
final_df.head(10)

Unnamed: 0,Movie,User,Score
0,B003AI2VGA,A1I7QGUDP043DG,5.0
1,B00006HAXW,AD4CDZK7D31XP,5.0
2,B00006HAXW,A3Q4S5DFVPB70D,5.0
3,B00006HAXW,A2P7UB02HAVEPB,5.0
4,B00006HAXW,A2TX99AZKDK0V7,4.0
5,B00006HAXW,AFC8IKR407HSK,5.0
6,B00006HAXW,A1FRPGQYQTAOR1,5.0
7,B00006HAXW,A1RSDE90N6RSZF,5.0
8,B00006HAXW,A1OUBOGB5970AO,4.0
9,B00006HAXW,A3NPHQVIY59Y0Y,5.0
