In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as dd

In [2]:
# Load only the three columns this analysis uses.
# `names=` is passed without `header=`, so pandas treats the first line of the
# file as data and labels the eight columns positionally; `usecols` then keeps
# the subset, so the free-text review/summary columns are never materialised
# just to be dropped again.
# The ID columns are pinned to `str` so numeric-looking ASINs are never
# inferred as integers. 'reviwerId' (sic) is spelled this way because the
# other cells reference the column by that name.
df = pd.read_csv('/Volumes/external/Sangeetha-Project/reviews3.csv',
                 names=['reviwerId', 'asin', 'review', 'rating',
                        'summary', 'unixtime', 'pos_votes', 'total_votes'],
                 usecols=['reviwerId', 'asin', 'rating'],
                 dtype={'reviwerId': str, 'asin': str}) \
    .astype({'rating': 'int8'})  # ratings are small integers; int8 keeps the frame compact

In [3]:
# Peek at the first five rows of the loaded review table.
df.head(n=5)

Unnamed: 0,reviwerId,asin,rating
0,ARX8A6TH2TC6U,1481948377,5
1,AA7F1IDEW0CW1,1481948377,5
2,A2WXRE0E5U02F2,1481948474,5
3,A1CZE84Z3882CS,1481948474,5
4,A26K553QITEKI5,1481948474,4


In [4]:
# Print the row count, column dtypes and memory usage of the full review table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2888041 entries, 0 to 2888040
Data columns (total 3 columns):
reviwerId    object
asin         object
rating       int8
dtypes: int8(1), object(2)
memory usage: 46.8+ MB


In [5]:
# First 1000 rows of `df`, taken as an independent copy so that changes to
# the sample never touch the full table.
small_df = df.head(1000).copy()

In [6]:
# Peek at the first five rows of the 1000-row sample.
small_df.head(n=5)

Unnamed: 0,reviwerId,asin,rating
0,ARX8A6TH2TC6U,1481948377,5
1,AA7F1IDEW0CW1,1481948377,5
2,A2WXRE0E5U02F2,1481948474,5
3,A1CZE84Z3882CS,1481948474,5
4,A26K553QITEKI5,1481948474,4


In [7]:
# Print the row count, non-null counts, dtypes and memory usage of the sample.
small_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
reviwerId    1000 non-null object
asin         1000 non-null object
rating       1000 non-null int8
dtypes: int8(1), object(2)
memory usage: 16.7+ KB


In [8]:
# Review-count distribution per reviewer and per book over the full table.
# Each value_counts() is computed once and reused for all three statistics.
reviews_per_reviewer = df.reviwerId.value_counts()
reviews_per_book = df.asin.value_counts()

# "under 5" is the share of reviewers/books that have fewer than 5 reviews;
# "Mean"/"Median" are the mean/median number of reviews per reviewer/book.
print('Reviewers under 5: ', (reviews_per_reviewer < 5).mean())
print('Mean reviewers: ', reviews_per_reviewer.mean())
print('Median reviewers: ', reviews_per_reviewer.median())
print('-----------------------------------------------')
print('Books under 5: ', (reviews_per_book < 5).mean())
print('Mean books: ', reviews_per_book.mean())
print('Median books: ', reviews_per_book.median())

Reviewers under 5:  0.6921917277909437
Mean reviewers:  5.9155348177946925
Median reviewers:  3.0
-----------------------------------------------
Books under 5:  6.522433911438392e-06
Mean books:  18.837056556024447
Median books:  9.0


In [9]:
# Share of books in the sample that appear in exactly one review.
small_df['asin'].value_counts().eq(1).mean()

0.0

In [10]:
# Number of reviews per book within the 1000-row sample.
counts = small_df['asin'].value_counts()

In [11]:
# Books in the sample that have exactly five reviews, in value_counts order.
[book for book, n_reviews in counts.items() if n_reviews == 5]

['1481951181']

In [12]:
# Number of reviews written by each reviewer in the full table.
id_counts = df['reviwerId'].value_counts()

In [13]:
# Number of reviews received by each book in the full table
# (value_counts sorts from most- to least-reviewed).
book_counts = df['asin'].value_counts()
book_counts

1940026016    2440
1612186009    1622
1611735270    1523
1594744769    1463
1495307352    1449
              ... 
1571741119       5
B00D0N20HK       5
B00947BD92       5
1560103450       5
1481948377       2
Name: asin, Length: 153317, dtype: int64

In [14]:
# Reviewer IDs with fewer than 3 reviews. The `_1` suffix is kept because
# other cells refer to this name, but the threshold is "< 3", not "== 1".
id_counts_1 = list(id_counts[id_counts < 3].index)

# Only a cell's last expression is displayed, so show the one value that
# matters here: how many reviewers fall below the threshold.
len(id_counts_1)

216686

In [15]:
# Rows of the full table written by the low-activity reviewers in id_counts_1.
df.loc[df['reviwerId'].isin(id_counts_1)]

Unnamed: 0,reviwerId,asin,rating
14,A3VTSWNTTKJ4MP,1481948474,4
22,AJRLGOL01MSNJ,1481948474,5
24,A1F5E305E57YUI,1481948474,3
35,A2KB8TVAOTOC9G,1481948474,3
51,A19MFDAQP7PK6F,1481948474,4
...,...,...,...
2887252,A1VGWBYW8NTYTR,B00LWF3UYU,5
2887331,A25BRDOJQP1L77,B00LWSNUQ0,5
2887462,AX2KBOLAVR5NX,B00LY51TPA,5
2887970,A1V8ZTQUBCCM1W,B00LZKMXBI,5


In [16]:
# Keep only reviews whose reviewer is NOT in the low-activity list.
# A negated boolean mask selects the surviving rows directly (original index
# labels are preserved) instead of building the rows-to-drop frame first and
# then removing them by label.
df_dense = df[~df.reviwerId.isin(id_counts_1)]

In [17]:
# Print the row count, column dtypes and memory usage of the filtered table.
df_dense.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2574378 entries, 0 to 2888040
Data columns (total 3 columns):
reviwerId    object
asin         object
rating       int8
dtypes: int8(1), object(2)
memory usage: 61.4+ MB


In [18]:
# Same review-count summary as for the full table, now on df_dense.
# Each value_counts() is computed once and reused for all three statistics.
# Removing reviewers also removes their reviews from books, so per-book
# counts here can be lower than in the unfiltered table.
dense_reviews_per_reviewer = df_dense.reviwerId.value_counts()
dense_reviews_per_book = df_dense.asin.value_counts()

# "under 5" is the share of reviewers/books that have fewer than 5 reviews;
# "Mean"/"Median" are the mean/median number of reviews per reviewer/book.
print('Reviewers under 5: ', (dense_reviews_per_reviewer < 5).mean())
print('Mean reviewers: ', dense_reviews_per_reviewer.mean())
print('Median reviewers: ', dense_reviews_per_reviewer.median())
print('-----------------------------------------------')
print('Books under 5: ', (dense_reviews_per_book < 5).mean())
print('Mean books: ', dense_reviews_per_book.mean())
print('Median books: ', dense_reviews_per_book.median())

Reviewers under 5:  0.44655227656918095
Mean reviewers:  9.48111237556486
Median reviewers:  5.0
-----------------------------------------------
Books under 5:  0.11695429484879132
Mean books:  16.796907317391447
Median books:  8.0


In [19]:
# Reviews per book after filtering, most-reviewed first.
df_dense['asin'].value_counts()

1940026016    1827
1495307352    1406
149539090X    1284
1490559175    1178
149043013X    1147
              ... 
1888698381       1
1587761459       1
159020462X       1
1609450841       1
1936041189       1
Name: asin, Length: 153265, dtype: int64

In [20]:
# Reviews per reviewer after filtering, most-active first.
df_dense['reviwerId'].value_counts()

A14OJS0VWMOSWO    5533
AFVQZQ8PW0L       1976
A13QTZ8CIMHHG4    1803
A320TMDV6KCFU     1573
A328S9RN3U5M68    1269
                  ... 
A2TLH3Z00S1SVJ       3
A1HTN5D60VMN2K       3
AI3QWTP63ZKX7        3
AE8NVOMR975UW        3
A5WYNVGEOGV4T        3
Name: reviwerId, Length: 271527, dtype: int64

In [21]:
# Number of distinct books left after filtering.
# dropna=False counts a missing value as its own entry, matching len(unique()).
df_dense['asin'].nunique(dropna=False)

153265

In [22]:
# Number of distinct reviewers left after filtering.
# dropna=False counts a missing value as its own entry, matching len(unique()).
df_dense['reviwerId'].nunique(dropna=False)

271527