In [1]:
import numpy as np
import pandas as pd

In [2]:
import seaborn as sns

In [37]:
movies_path = "ml-1m/movies.dat"
users_path = "ml-1m/users.dat"

In [40]:
movies = pd.read_csv(movies_path,sep="::",engine='python',header=None,names=["MovieID","Title","Genres"])
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [41]:
movies.shape

(3883, 3)

In [42]:
users = pd.read_csv(users_path,sep="::",engine='python',header=None,
                    names=["UserID","Gender","Age","Occupation","Zip-code"])
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [43]:
movies = movies.set_index(movies.MovieID).drop(["MovieID"],axis = 1)

In [44]:
movies.head()

Unnamed: 0_level_0,Title,Genres
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama
5,Father of the Bride Part II (1995),Comedy


In [45]:
users = users.set_index("UserID")
users.head()

Unnamed: 0_level_0,Gender,Age,Occupation,Zip-code
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,F,1,10,48067
2,M,56,16,70072
3,M,25,15,55117
4,M,45,7,2460
5,M,25,20,55455


In [46]:
import re

In [47]:
movies[["Title","Year"]] = movies.Title.str.extract(r"(.*)\s\((\d{4})\)",expand=None)

  if __name__ == '__main__':


In [48]:
movies.head()

Unnamed: 0_level_0,Title,Genres,Year
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story,Animation|Children's|Comedy,1995
2,Jumanji,Adventure|Children's|Fantasy,1995
3,Grumpier Old Men,Comedy|Romance,1995
4,Waiting to Exhale,Comedy|Drama,1995
5,Father of the Bride Part II,Comedy,1995


In [49]:
movies[movies.Year.isnull()]

Unnamed: 0_level_0,Title,Genres,Year
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [51]:
movies["Year"] = movies.Year.astype(int)

## ile jest wszystkich filmów?

In [54]:
len(movies)

3883

In [53]:
movies.dtypes

Title     object
Genres    object
Year       int64
dtype: object

## ile filmów powstało w poszczególnych latach?

In [58]:
movies.groupby("Year").count()["Title"].sort_values(ascending = False).head(10)

Year
1996    345
1995    342
1998    337
1997    315
1999    283
1994    257
1993    165
2000    156
1986    104
1992    102
Name: Title, dtype: int64

In [135]:
pd.crosstab(movies.Year,columns="count").head()

col_0,count
Year,Unnamed: 1_level_1
1919,3
1920,2
1921,1
1922,2
1923,3


## jak wygląda rozkład płci oraz grup wiekowych wśród użytkowników?

In [59]:
pd.crosstab(users.Gender,users.Age)

Age,1,18,25,35,45,50,56
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
F,78,298,558,338,189,146,102
M,144,805,1538,855,361,350,278


## jaki gatunek filmowy jest najczęstszy?

In [60]:
genres = movies.Genres.str.split('|',expand=True).values.ravel()
genres

array(['Animation', "Children's", 'Comedy', ..., None, None, None], dtype=object)

In [61]:
genres = genres[genres != np.array(None)]
pd.Series(genres).value_counts()

Drama          1603
Comedy         1200
Action          503
Thriller        492
Romance         471
Horror          343
Adventure       283
Sci-Fi          276
Children's      251
Crime           211
War             143
Documentary     127
Musical         114
Mystery         106
Animation       105
Western          68
Fantasy          68
Film-Noir        44
dtype: int64

## jaki jest najlepszy film wszechczasów, (najlepszy, czyli ma najwyższą średnią ocenę) - to zadanie możesz rozwiązać wykonując złączenie (join) zbioru movies i ratings?

In [62]:
ratings_path = "ml-1m/ratings.dat"

In [63]:
ratings = pd.read_csv(ratings_path,sep="::",engine='python',header=None,
                      names=["UserID","MovieID","Rating","Timestamp"])

In [64]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [66]:
title_rating = pd.merge(movies,ratings,left_index=True,right_on='MovieID').drop(
    ["Genres","Year","UserID","Timestamp"],axis = 1)
title_rating.head(10)

Unnamed: 0,Title,MovieID,Rating
40,Toy Story,1,5
469,Toy Story,1,4
581,Toy Story,1,4
711,Toy Story,1,5
837,Toy Story,1,5
1966,Toy Story,1,4
2276,Toy Story,1,5
2530,Toy Story,1,3
2870,Toy Story,1,4
3405,Toy Story,1,3


In [70]:
title_rating.groupby('Title')['Rating'].mean().sort_values(ascending = False).head(15)

Title
Ulysses (Ulisse)                                                5.000000
Schlafes Bruder (Brother of Sleep)                              5.000000
Smashing Time                                                   5.000000
Song of Freedom                                                 5.000000
Gate of Heavenly Peace, The                                     5.000000
Lured                                                           5.000000
Baby, The                                                       5.000000
Bittersweet Motel                                               5.000000
Follow the Bitch                                                5.000000
One Little Indian                                               5.000000
I Am Cuba (Soy Cuba/Ya Kuba)                                    4.800000
Lamerica                                                        4.750000
Apple, The (Sib)                                                4.666667
Sanjuro                                      

## wykonaj poprzedni punkt, odrzucając wcześniej filmy które nie uzyskały wystarczająco dużo głosów (np 100)?

In [71]:
title_count_rating = title_rating.groupby('Title').aggregate(['count','mean'])
title_count_rating.head()

Unnamed: 0_level_0,MovieID,MovieID,Rating,Rating
Unnamed: 0_level_1,count,mean,count,mean
Title,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
"$1,000,000 Duck",37,2031.0,37,3.027027
'Night Mother,70,3112.0,70,3.371429
'Til There Was You,52,779.0,52,2.692308
"'burbs, The",303,2072.0,303,2.910891
...And Justice for All,199,3420.0,199,3.713568


In [73]:
title_count_rating[title_count_rating['Rating','count'] > 100]['Rating','mean'].sort_values(ascending = False).head(15)

Title
Seven Samurai (The Magnificent Seven) (Shichinin no samurai)            4.560510
Shawshank Redemption, The                                               4.554558
Godfather, The                                                          4.524966
Close Shave, A                                                          4.520548
Usual Suspects, The                                                     4.517106
Schindler's List                                                        4.510417
Wrong Trousers, The                                                     4.507937
Sunset Blvd. (a.k.a. Sunset Boulevard)                                  4.491489
Raiders of the Lost Ark                                                 4.477725
Rear Window                                                             4.476190
Paths of Glory                                                          4.473913
Star Wars: Episode IV - A New Hope                                      4.453694
Third Man, The        

## jaki jest najlepszy film według kobiet i według mężczyzn?

In [123]:
title_gender = pd.merge(users,ratings,left_index=True,right_on='UserID').drop(
    ["Zip-code",'Occupation','Timestamp'],axis=1)
title_gender.head()

Unnamed: 0,Gender,Age,UserID,MovieID,Rating
0,F,1,1,1193,5
1,F,1,1,661,3
2,F,1,1,914,3
3,F,1,1,3408,4
4,F,1,1,2355,5


In [124]:
rating_gender = pd.merge(title_gender,movies,left_on='MovieID',right_index=True).drop(
    ["Year",'MovieID','Genres','UserID','Age'],axis=1)
rating_gender.head()

Unnamed: 0,Gender,Rating,Title
0,F,5,One Flew Over the Cuckoo's Nest
120,M,5,One Flew Over the Cuckoo's Nest
1339,M,4,One Flew Over the Cuckoo's Nest
1518,M,4,One Flew Over the Cuckoo's Nest
1747,M,5,One Flew Over the Cuckoo's Nest


In [128]:
full_ratings = rating_gender.groupby(['Gender','Title']).aggregate(['count','mean']).reset_index()
full_ratings.columns = ["Gender", "Title", "Count", "Mean"]
full_ratings.head()

Unnamed: 0,Gender,Title,Count,Mean
0,F,"$1,000,000 Duck",16,3.375
1,F,'Night Mother,36,3.388889
2,F,'Til There Was You,37,2.675676
3,F,"'burbs, The",92,2.793478
4,F,...And Justice for All,35,3.828571


In [143]:
full_ratings = full_ratings[full_ratings.Count > 100]
full_ratings.head()

Unnamed: 0,Gender,Title,Count,Mean
6,F,10 Things I Hate About You,232,3.646552
7,F,101 Dalmatians,337,3.545994
8,F,12 Angry Men,141,4.184397
9,F,"13th Warrior, The",125,3.112
15,F,2001: A Space Odyssey,344,3.825581


In [142]:
full_ratings2 = full_ratings.sort_values(by=["Gender","Mean"],ascending=False).reset_index()
full_ratings2.drop_duplicates(["Gender"]).drop(["index"],axis=1)

Unnamed: 0,Gender,Title,Count,Mean
0,M,"Godfather, The",1740,4.583333
1733,F,"Close Shave, A",180,4.644444


## jaki jest średni rok oglądanego filmu w poszczególnych grupach wiekowych?

In [148]:
title_age = pd.merge(users,ratings,left_index=True,right_on='UserID').drop(
    ["Zip-code",'Occupation','Timestamp','Gender','Rating'],axis=1)
title_age.head()

Unnamed: 0,Age,UserID,MovieID
0,1,1,1193
1,1,1,661
2,1,1,914
3,1,1,3408
4,1,1,2355


In [149]:
rating_age= pd.merge(title_age,movies,left_on='MovieID',right_index=True).drop(
    ['Genres','UserID'],axis=1)
rating_age.head()

Unnamed: 0,Age,MovieID,Title,Year
0,1,1193,One Flew Over the Cuckoo's Nest,1975
120,56,1193,One Flew Over the Cuckoo's Nest,1975
1339,25,1193,One Flew Over the Cuckoo's Nest,1975
1518,25,1193,One Flew Over the Cuckoo's Nest,1975
1747,50,1193,One Flew Over the Cuckoo's Nest,1975


In [150]:
rating_age.dtypes

Age         int64
MovieID     int64
Title      object
Year        int64
dtype: object

In [155]:
pd.DataFrame(rating_age.groupby('Age')['Year'].mean())

Unnamed: 0_level_0,Year
Age,Unnamed: 1_level_1
1,1988.981699
18,1989.701982
25,1987.972972
35,1984.965478
45,1983.50052
50,1982.483211
56,1981.549097


## jakie trzy gatunki filmowe są najczęściej oglądane przez kobiety i mężczyzn?

In [163]:
users_movie = pd.merge(users,ratings,left_index=True,right_on="UserID").drop([
    "Age","Occupation","Zip-code","Rating"],axis=1)
users_movie.head()

Unnamed: 0,Gender,UserID,MovieID,Timestamp
0,F,1,1193,978300760
1,F,1,661,978302109
2,F,1,914,978301968
3,F,1,3408,978300275
4,F,1,2355,978824291


In [169]:
gender_genres = pd.merge(users_movie,movies,left_on="MovieID",right_index=True).drop([
    "Year","UserID","MovieID","Title","Timestamp"],axis=1)
gender_genres.head()

Unnamed: 0,Gender,Genres
0,F,Drama
120,M,Drama
1339,M,Drama
1518,M,Drama
1747,M,Drama


In [174]:
f_genres = gender_genres[gender_genres.Gender == "F"]
f_genres.tail()

Unnamed: 0,Gender,Genres
883623,F,Drama
896452,F,Children's|Comedy
898557,F,Action|Sci-Fi|Thriller
908506,F,Comedy
970914,F,Comedy|Drama|Western


In [175]:
m_genres = gender_genres[gender_genres.Gender == "M"]
m_genres.tail()

Unnamed: 0,Gender,Genres
919876,M,Documentary
984335,M,Documentary
940262,M,Drama
957826,M,Drama
983062,M,Documentary


In [192]:
pd.Series(f_genres.Genres.str.split("|",expand=True).values.ravel()).dropna().value_counts()

Drama          98153
Comedy         96271
Romance        50297
Action         45650
Thriller       40308
Sci-Fi         27400
Adventure      27332
Children's     21317
Crime          16442
Horror         14635
War            14093
Musical        13505
Animation      12221
Mystery         9976
Fantasy         8718
Film-Noir       4202
Western         3477
Documentary     1940
dtype: int64

In [193]:
pd.Series(m_genres.Genres.str.split("|",expand=True).values.ravel()).dropna().value_counts()

Comedy         260309
Drama          256376
Action         211807
Thriller       149372
Sci-Fi         129894
Adventure      106621
Romance         97226
Crime           63099
Horror          61751
War             54434
Children's      50869
Animation       31072
Mystery         30202
Musical         28028
Fantasy         27583
Western         17206
Film-Noir       14059
Documentary      5970
dtype: int64