In [1]:
import pandas as pd # pandas is a data manipulation library
import numpy as np #provides numerical arrays and functions to manipulate the arrays efficiently
import random
import matplotlib.pyplot as plt # data visualization library
import operator

In [2]:
m_cols = ['movie_id','movie title','release date','video release date','IMDb URL'
          ,'unknown','Action','Adventure','Animation','Childrens','Comedy',
          'Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi',
        'Thriller','War','Western']
data_u_item = pd.read_csv('ml-100k/u.item',delimiter='|',names=m_cols,encoding='latin-1')

In [3]:
data_u_item.head()

Unnamed: 0,movie_id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
data_u_item.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 24 columns):
movie_id              1682 non-null int64
movie title           1682 non-null object
release date          1681 non-null object
video release date    0 non-null float64
IMDb URL              1679 non-null object
unknown               1682 non-null int64
Action                1682 non-null int64
Adventure             1682 non-null int64
Animation             1682 non-null int64
Childrens             1682 non-null int64
Comedy                1682 non-null int64
Crime                 1682 non-null int64
Documentary           1682 non-null int64
Drama                 1682 non-null int64
Fantasy               1682 non-null int64
Film-Noir             1682 non-null int64
Horror                1682 non-null int64
Musical               1682 non-null int64
Mystery               1682 non-null int64
Romance               1682 non-null int64
Sci-Fi                1682 non-null int64
Thriller 

In [5]:
#explore users
u_cols = ['user_id','age','gender','occupation','zip_code']
data_u_users = pd.read_csv('ml-100k/u.user',delimiter='|',names=u_cols,encoding='latin-1')

In [6]:
data_u_users.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [7]:
#explore ratings
r_cols = ['user_id','movie_id','rating','timestamp']
data_rating_u1_base = pd.read_csv('ml-100k/u1.base',delimiter='\t',names=r_cols,encoding='latin-1')

In [8]:
data_rating_u1_base.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [9]:
# merge movies and rating data
movies_ratings_data = data_u_item.merge(data_rating_u1_base,on = 'movie_id',how = 'inner')

# 2

## a

In [20]:
def movies_by_mean_ratings(movies_and_ratings_data):
    movies_mean_ratings = movies_and_ratings_data[['movie_id','movie title','rating']].groupby(['movie_id','movie title']).mean()
    movies_mean_ratings = movies_mean_ratings.sort_values(['rating'],ascending=False)
    return movies_mean_ratings

In [21]:
movies_by_mean_ratings(movies_ratings_data).head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
movie_id,movie title,Unnamed: 2_level_1
1189,Prefontaine (1997),5.0
1653,Entertaining Angels: The Dorothy Day Story (1996),5.0
1599,Someone Else's America (1995),5.0


## b

In [22]:
def movies_by_count_ratings(movies_and_ratings_data):
    movies_ratings_count = movies_and_ratings_data[['movie_id','movie title','rating']].groupby(['movie_id','movie title']).count()
    movies_ratings_count = movies_ratings_count.sort_values(['rating'],ascending=False)
    return movies_ratings_count

In [23]:
movies_by_count_ratings(movies_ratings_data).head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
movie_id,movie title,Unnamed: 2_level_1
50,Star Wars (1977),484
181,Return of the Jedi (1983),422
258,Contact (1997),402


## c

In [24]:
def movies_by_rating_percent(movies_and_ratings_data):
    count_rating = movies_and_ratings_data[['movie_id','movie title','rating']].groupby(['movie_id','movie title']).count()
    count_rating = count_rating.reset_index()
    
    count_rating_big_than_4 = movies_and_ratings_data[['movie_id','movie title','rating']][movies_and_ratings_data.rating > 4].groupby(['movie_id','movie title']).count()
    count_rating_big_than_4 = count_rating_big_than_4.reset_index()
    
    movies_by_rating_percent  = pd.DataFrame(columns = ['movie_id','movie title','rating_percent'])
    for index,row in count_rating_big_than_4.iterrows():
        movie_id = row.movie_id
        movie_title = row['movie title']
        movie_rate_count = row.rating

        total_rating = count_rating.loc[count_rating['movie_id'] == movie_id].rating.values[0]
        rating_percent = (float(movie_rate_count) / total_rating) * 100

        movie = pd.DataFrame([[movie_id, movie_title, int(rating_percent)]], columns = ['movie_id','movie title','rating_percent'])
        movies_by_rating_percent = movies_by_rating_percent.append(movie, ignore_index=True)

    movies_by_rating_percent = movies_by_rating_percent.sort_values(['rating_percent'],ascending=False)
    return movies_by_rating_percent

In [25]:
movies_by_rating_percent(movies_ratings_data).head(3)

Unnamed: 0,movie_id,movie title,rating_percent
1074,1467,"Saint of Fort Washington, The (1993)",100
1113,1653,Entertaining Angels: The Dorothy Day Story (1996),100
1085,1500,Santa with Muscles (1996),100


## d

In [26]:
import math
def new_rating_cala(mean,count):
    new = (math.log10(count) + 1) * mean
    new = (new / 5) + 1.78718
    return new

In [27]:
def movies_by_new_ratings(movies_and_ratings_data):
    new_ratings  = pd.DataFrame(columns = ['movie_id','movie title','rating'])

    mean_ratings = movies_by_mean_ratings(movies_and_ratings_data).reset_index()
    count_ratings = movies_by_count_ratings(movies_and_ratings_data).reset_index()
    for index,row in count_ratings.iterrows():
        movie_id = row.movie_id
        movie_title = row['movie title']
        movie_rate_count = row.rating
        movie_rate_mean = mean_ratings.loc[mean_ratings['movie_id'] == movie_id].rating.values[0]

        movie = pd.DataFrame([[movie_id, movie_title, new_rating_cala(movie_rate_mean,movie_rate_count)]], columns = ['movie_id','movie title','rating'])
        new_ratings = new_ratings.append(movie)

    new_ratings = new_ratings.sort_values(['rating'],ascending=False)
    return new_ratings

In [28]:
movies_by_new_ratings(movies_ratings_data).head(3)

Unnamed: 0,movie_id,movie title,rating
0,50,Star Wars (1977),5.0
0,318,Schindler's List (1993),4.817334
0,127,"Godfather, The (1972)",4.795169


## e

In [29]:
#merge movies ratings data with users
movie_ratings_data_and_users = movies_ratings_data.merge(data_u_users,on = 'user_id',how = 'inner')

## MANS

In [30]:
mans_rating = movie_ratings_data_and_users[['movie_id','movie title','rating']][movie_ratings_data_and_users.gender == 'M']

In [39]:
mans_rating.head()

Unnamed: 0,movie_id,movie title,rating
0,1,Toy Story (1995),5
1,2,GoldenEye (1995),3
2,3,Four Rooms (1995),4
3,4,Get Shorty (1995),3
4,5,Copycat (1995),3


## e, a

In [40]:
movies_by_mean_ratings(mans_rating).head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
movie_id,movie title,Unnamed: 2_level_1
1500,Santa with Muscles (1996),5.0
1189,Prefontaine (1997),5.0
1656,Little City (1998),5.0


## e, b

In [41]:
movies_by_count_ratings(mans_rating).head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
movie_id,movie title,Unnamed: 2_level_1
50,Star Wars (1977),359
181,Return of the Jedi (1983),318
100,Fargo (1996),300


## e, c

In [42]:
movies_by_rating_percent(mans_rating).head(3)

Unnamed: 0,movie_id,movie title,rating_percent
983,1656,Little City (1998),100
887,1201,Marlene Dietrich: Shadow and Light (1996),100
962,1500,Santa with Muscles (1996),100


## e, d

In [43]:
movies_by_new_ratings(mans_rating).head(3)

Unnamed: 0,movie_id,movie title,rating
0,50,Star Wars (1977),4.89863
0,127,"Godfather, The (1972)",4.730887
0,174,Raiders of the Lost Ark (1981),4.718803


## womans

In [44]:
woman_ratings = movie_ratings_data_and_users[['movie_id','movie title','rating']][movie_ratings_data_and_users.gender == 'F']

In [45]:
woman_ratings.head()

Unnamed: 0,movie_id,movie title,rating
135,1,Toy Story (1995),4
136,10,Richard III (1995),2
137,14,"Postino, Il (1994)",4
138,25,"Birdcage, The (1996)",4
139,100,Fargo (1996),5


## e, a

In [46]:
movies_by_mean_ratings(woman_ratings).head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
movie_id,movie title,Unnamed: 2_level_1
1594,Everest (1998),5.0
1150,Last Dance (1996),5.0
1301,Stripes (1981),5.0


## e, b

In [47]:
movies_by_count_ratings(woman_ratings).head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
movie_id,movie title,Unnamed: 2_level_1
50,Star Wars (1977),125
286,"English Patient, The (1996)",125
288,Scream (1996),118


## e, c

In [48]:
movies_by_rating_percent(woman_ratings).head(3)

Unnamed: 0,movie_id,movie title,rating_percent
66,74,Faster Pussycat! Kill! Kill! (1965),100
845,1594,Everest (1998),100
634,838,In the Line of Duty 2 (1987),100


## e, d

In [49]:
movies_by_new_ratings(woman_ratings).head(3)

Unnamed: 0,movie_id,movie title,rating
0,50,Star Wars (1977),4.457955
0,318,Schindler's List (1993),4.381402
0,98,"Silence of the Lambs, The (1991)",4.307867


## Conclusions:

### Are there differences in mean values between two populations?

In [50]:
mans_rating[['rating']].mean()

rating    3.529115
dtype: float64

In [52]:
woman_ratings[['rating']].mean()

rating    3.526187
dtype: float64

### What are the three most popular movies among women?

In [53]:
movies_by_new_ratings(woman_ratings).head(3)

Unnamed: 0,movie_id,movie title,rating
0,50,Star Wars (1977),4.457955
0,318,Schindler's List (1993),4.381402
0,98,"Silence of the Lambs, The (1991)",4.307867


### What are the three most popular movies among men?

In [54]:
movies_by_new_ratings(mans_rating).head(3)

Unnamed: 0,movie_id,movie title,rating
0,50,Star Wars (1977),4.89863
0,127,"Godfather, The (1972)",4.730887
0,174,Raiders of the Lost Ark (1981),4.718803


# 3

## a

### build model

In [55]:
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD
from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import mean_absolute_error

#Generic function for making a classification model and accessing performance:
def classification_model(model, train,test, predictors, outcome):
    #Fit the model:
    predictions = model.fit(train[predictors],train[outcome]).predict(test[predictors])

    #Print accuracy
    accuracy = metrics.accuracy_score(predictions,test[outcome])
    print("Accuracy : %s" % "{0:.3%}".format(accuracy))

    print "MAE: {}".format(mean_absolute_error(test[outcome], predictions))

### prepare u1.test

In [56]:
r_cols = ['user_id','movie_id','rating','timestamp']
test = pd.read_csv('ml-100k/u1.test',delimiter='\t',names=r_cols,encoding='latin-1')
test = test[['movie_id','rating']].groupby('movie_id').mean()
test = test.reset_index()
test.head()

Unnamed: 0,movie_id,rating
0,1,3.797101
1,2,3.307692
2,3,3.2
3,4,3.657895
4,5,3.294118


In [57]:
# merge u1.test (ranking) with movies
test = data_u_item.merge(test,on = 'movie_id',how = 'inner')
test = test.drop(['movie title', 'video release date', 'IMDb URL'], axis=1)

In [58]:
test.head()

Unnamed: 0,movie_id,release date,unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,rating
0,1,01-Jan-1995,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,3.797101
1,2,01-Jan-1995,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,3.307692
2,3,01-Jan-1995,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,3.2
3,4,01-Jan-1995,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,3.657895
4,5,01-Jan-1995,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,3.294118


In [59]:
# need to change value type. for rating and release date
from sklearn.preprocessing import LabelEncoder
def factrozied(data):
    le = LabelEncoder()
    data['rating'] = le.fit_transform(data['rating'].astype(long)) + 1
    data['release date'] = le.fit_transform(data['release date'].astype(str))
    return data

In [60]:
test = factrozied(test)
test.head(3)

Unnamed: 0,movie_id,release date,unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,rating
0,1,67,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,3
1,2,67,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,3
2,3,67,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,3


## AdaBoost

In [61]:
def prepare_train(train):
    train = train.reset_index()
    train = data_u_item.merge(train,on = ['movie_id','movie title'],how = 'inner')
    train = train.drop(['movie title', 'video release date', 'IMDb URL'], axis=1)
    train = factrozied(train)
    return train

### 1: using Q2 - a

In [62]:
# prepare train:
mean_movie_ratings = movies_by_mean_ratings(movies_ratings_data)
mean_movie_ratings = prepare_train(mean_movie_ratings)
mean_movie_ratings.head(3)

Unnamed: 0,movie_id,release date,unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,rating
0,1,69,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,3
1,2,69,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,3
2,3,69,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,3


In [63]:
outcome_var = ['rating']
model = AdaBoostClassifier(n_estimators=20)
predictor_var = ['release date','unknown','Action','Adventure','Animation','Childrens','Comedy',
          'Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi',
        'Thriller','War','Western']
classification_model(model, mean_movie_ratings,test,predictor_var,outcome_var)

Accuracy : 47.163%
MAE: 0.628368794326


  y = column_or_1d(y, warn=True)


### 2: using Q2 - d

In [64]:
# prepare train:
mean_my_movie_ratings = movies_by_new_ratings(movies_ratings_data)
mean_my_movie_ratings = prepare_train(mean_my_movie_ratings)
mean_my_movie_ratings.head(3)

Unnamed: 0,movie_id,release date,unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,index,rating
0,1,69,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,4
1,2,69,0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,3
2,3,69,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,3


In [65]:
outcome_var = ['rating']
model = AdaBoostClassifier(n_estimators=20)
predictor_var = ['release date','unknown','Action','Adventure','Animation','Childrens','Comedy',
          'Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi',
        'Thriller','War','Western']
classification_model(model, mean_my_movie_ratings,test,predictor_var,outcome_var)

Accuracy : 43.050%
MAE: 0.703546099291


# 3

## b

In [66]:
data_rating_u1_base.head(3)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960


In [None]:
from collections import defaultdict
all_user = {}
all_user = defaultdict(list)
for index,row in data_rating_u1_base.iterrows():
    all_user[row[0]].append(row[1])

In [None]:
def calc_recall_precision():
    recall = 0
    precision = 0
    return recall,precision

In [None]:
movies_rate_a = movies_by_mean_ratings(movies_ratings_data).reset_index()['movie_id'].values.tolist()
movies_rate_a = map(lambda x: int(x),movies_rate_a)

In [None]:
movies_rate_b = movies_by_count_ratings(movies_ratings_data).reset_index()['movie_id'].values.tolist()
movies_rate_b = map(lambda x: int(x),movies_rate_b)

In [None]:
movies_rate_c = movies_by_rating_percent(movies_ratings_data).reset_index()['movie_id'].values.tolist()
movies_rate_c = map(lambda x: int(x),movies_rate_c)

In [None]:
movies_rate_d = movies_by_new_ratings(movies_ratings_data).reset_index()['movie_id'].values.tolist()
movies_rate_d = map(lambda x: int(x),movies_rate_d)

In [None]:
movies_rate_random = movies_ratings_data.reset_index()['movie_id'].values.tolist()
movies_rate_random = map(lambda x: int(x),movies_rate_random)

In [None]:
# limit the running to only 10 users
all_users_recomendations = {}
i = 0
for user_id, movie_list in all_user.iteritems():
    i = i + 1
    if i == 10:
        break
    new_movies_rate_a = [item for item in movies_rate_a if item not in movie_list][0:20]
    new_movies_rate_b = [item for item in movies_rate_b if item not in movie_list][0:20]
    new_movies_rate_c = [item for item in movies_rate_c if item not in movie_list][0:20]
    new_movies_rate_d = [item for item in movies_rate_d if item not in movie_list][0:20]
    new_movies_rate_random = [item for item in movies_rate_random if item not in movie_list][0:20]
    all_users_recomendations[user_id] = [new_movies_rate_a,new_movies_rate_b,new_movies_rate_c,new_movies_rate_d,new_movies_rate_random]

## example: 20 movie_ids recomended to user 1 according to mean rate

In [None]:
all_users_recomendations[1][0]