## Data Scrape Quiz

In [1]:
from IPython.display import HTML
import numpy as np
from urllib.request import urlopen
import bs4
import time
import operator
import socket
import pickle
import re

from pandas import Series
import pandas as pd
from pandas import DataFrame

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_context("talk")
sns.set_style("white")

In [2]:
# read Users Dataframe
user_cols = ["user_id", "age", "sex", "occupation", "zip_code"]

users_df = pd.read_csv("data/users.csv", header=None, sep="|", names=user_cols)

users_df.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [3]:
# read Ratings Dataframe
ratings_cols = ["user_id", "movie_id", "rating", "unix_timestamp"]

ratings_df = pd.read_csv("data/ratings.csv", header=None, sep="\t", names=ratings_cols)

ratings_df.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
# read Movies Dataframe
movies_cols = ["movie_id", "title", "release_date", "video_release_date", "imdb_url"]

movies_df = pd.read_csv("data/movies.csv", header=None, sep="|", names=movies_cols, usecols=range(5))

movies_df.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),0


In [5]:
users_df.dtypes

user_id        int64
age            int64
sex           object
occupation    object
zip_code      object
dtype: object

In [6]:
ratings_df.dtypes

user_id           int64
movie_id          int64
rating            int64
unix_timestamp    int64
dtype: object

In [7]:
movies_df.dtypes

movie_id               int64
title                 object
release_date          object
video_release_date    object
imdb_url               int64
dtype: object

In [8]:
# Generates descriptive statistics that summarize the central tendency, dispersion and shape of a
# dataset’s distribution, excluding NaN values.
# only display columns comprised of numerical values
print(users_df.describe())
print()
print(movies_df.describe())
print()
print(ratings_df.describe())


          user_id         age
count  943.000000  943.000000
mean   472.000000   34.051962
std    272.364951   12.192740
min      1.000000    7.000000
25%    236.500000   25.000000
50%    472.000000   31.000000
75%    707.500000   43.000000
max    943.000000   73.000000

          movie_id     imdb_url
count  1682.000000  1682.000000
mean    841.500000     0.000595
std     485.695893     0.024383
min       1.000000     0.000000
25%     421.250000     0.000000
50%     841.500000     0.000000
75%    1261.750000     0.000000
max    1682.000000     1.000000

            user_id       movie_id         rating  unix_timestamp
count  100000.00000  100000.000000  100000.000000    1.000000e+05
mean      462.48475     425.530130       3.529860    8.835289e+08
std       266.61442     330.798356       1.125674    5.343856e+06
min         1.00000       1.000000       1.000000    8.747247e+08
25%       254.00000     175.000000       3.000000    8.794487e+08
50%       447.00000     322.000000       4.0

In [9]:
# Access data by row or range
users_df.iloc[:5]

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [10]:
# Mean age of all female programmers
users_df[(users_df.occupation == "programmer") & (users_df.sex == "F")].age.mean()

32.166666666666664

In [11]:
# 40 year-old males
users_df[(users_df.age == 40) & (users_df.sex == "M")]

Unnamed: 0,user_id,age,sex,occupation,zip_code
18,19,40,M,librarian,2138
82,83,40,M,other,44133
115,116,40,M,healthcare,97232
199,200,40,M,programmer,93402
283,284,40,M,executive,92629
289,290,40,M,engineer,93550
308,309,40,M,scientist,70802
357,358,40,M,educator,10022
397,398,40,M,other,60008
564,565,40,M,student,55422


In [12]:
# Split Apply Combine number of ratings per user
ratings_df.groupby("user_id").count().head()

# ratings_df.movie_id.groupby(ratings_df.user_id).count().head()

Unnamed: 0_level_0,movie_id,rating,unix_timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,272,272,272
2,62,62,62
3,54,54,54
4,24,24,24
5,175,175,175


In [13]:
# Average Rating Per Movie
ratings_df.rating.mean()

3.52986

In [14]:
# Get movie titles with highest average rating
# Left Join Ratings and Movies Dataframes based on movie_id, and select the title from the Movies Dataframe
pd.merge(ratings_df.sort_values(by="rating", ascending=False), movies_df[["movie_id", "title"]], how="left", on=["movie_id"]).head(15)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title
0,64,183,5,889737914,Alien (1979)
1,213,121,5,878870940,Independence Day (ID4) (1996)
2,286,707,5,877531975,Enchanted April (1991)
3,94,518,5,891720950,Miller's Crossing (1990)
4,108,10,5,879879834,Richard III (1995)
5,331,81,5,877196702,"Hudsucker Proxy, The (1994)"
6,363,172,5,891495711,"Empire Strikes Back, The (1980)"
7,617,192,5,883788900,Raging Bull (1980)
8,694,199,5,875728435,"Bridge on the River Kwai, The (1957)"
9,801,332,5,890332719,Kiss the Girls (1997)


In [15]:
# Determine if occupational groups have more Males than Females
users_df.groupby("occupation").apply(lambda user: sum(user.sex == "M") > sum(user.sex == "F"))

occupation
administrator     True
artist            True
doctor            True
educator          True
engineer          True
entertainment     True
executive         True
healthcare       False
homemaker        False
lawyer            True
librarian        False
marketing         True
none              True
other             True
programmer        True
retired           True
salesman          True
scientist         True
student           True
technician        True
writer            True
dtype: bool

In [16]:
# Average Rating Per User
ratings_df[["rating", "user_id"]].groupby("user_id").mean().head()

Unnamed: 0_level_0,rating
user_id,Unnamed: 1_level_1
1,3.610294
2,3.709677
3,2.796296
4,4.333333
5,2.874286
