In [222]:
import os
import pandas as pd
import numpy as np

In [223]:
def get_movie_data(data_dir='../data'):
    """Load the users, ratings and movies tables from '::'-delimited files.

    Parameters
    ----------
    data_dir : str, optional
        Directory that contains ``users.dat``, ``ratings.dat`` and
        ``movies.dat``. Defaults to ``'../data'``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(users, ratings, movies)`` with these columns:

        * users   -- user_id, gender, age, occupation, zip
        * ratings -- user_id, movie_id, rating, timestamp
        * movies  -- movie_id, title, genres
    """

    def _read_dat(filename, column_names):
        # '::' is a multi-character separator, which only the Python parsing
        # engine supports. Selecting it explicitly avoids the ParserWarning
        # pandas emits when it has to fall back from the default C engine.
        return pd.read_table(os.path.join(data_dir, filename),
                             sep='::', header=None, names=column_names,
                             engine='python')

    users = _read_dat('users.dat',
                      ['user_id', 'gender', 'age', 'occupation', 'zip'])
    ratings = _read_dat('ratings.dat',
                        ['user_id', 'movie_id', 'rating', 'timestamp'])
    movies = _read_dat('movies.dat',
                       ['movie_id', 'title', 'genres'])

    return users, ratings, movies

In [224]:
# Load the three tables once; the cells below parse `movies` and join it with
# `ratings` and `users`.
users, ratings, movies = get_movie_data()



In [225]:
# Split each title of the form "<name> (<digits>)" into its two parts.
# str.extract returns one column per capture group (0 = name, 1 = digits) and
# NaN for titles that do not match. str.match cannot be used for this on
# current pandas because it only returns booleans. The pattern is a raw string
# so the escaped parentheses reach the regex engine unchanged.
title_parts = movies.title.str.extract(r'(.*) \(([0-9]+)\)')
movies['year'] = title_parts[1]
# Keep only the first 40 characters of the name part.
movies['short_title'] = title_parts[0].str[:40]



In [226]:
# With no explicit key, merge joins on the columns both frames share:
# `movie_id` for ratings/movies, then `user_id` for the result and users.
# Both joins use the default inner join.
movies_ratings = ratings.merge(movies)
merged = movies_ratings.merge(users)

# Part-1 Movie Data

The joined data is grouped by movie title so that the number of ratings and the mean rating can be computed for each movie. Movies with fewer than 200 ratings are filtered out, the remaining rows are sorted by mean rating in descending order, and the first row is taken. That row is the movie with the highest average rating.

In [227]:
# Per title: number of ratings ('size') and average rating ('mean').
# String aggregator names yield the same column labels as np.size / np.mean
# without the deprecation warning newer pandas raises for NumPy callables.
groups = movies_ratings.groupby('title').aggregate({'rating': ['size', 'mean']})
# Only consider movies with at least 200 ratings.
limited = groups[('rating', 'size')] >= 200
# DataFrame.sort was removed from pandas; sort_values is its replacement.
groups[limited].sort_values(by=[('rating', 'mean')], ascending=False).head(1)

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954),628,4.56051


A movie must have at least 200 ratings to be considered; otherwise, a movie with a single rating of 5 would count as the most highly rated movie.
Hence, the most highly rated movie is: <b> Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)</b>

Let's get the 200 movies that have received the most ratings.

In [228]:
# Number of ratings per movie, largest first; keep the 200 most-rated movies.
# Series.order was removed from pandas; sort_values is its replacement.
top_200 = merged.groupby('movie_id').size().sort_values(ascending=False).head(200)


Now a pivot table is created with one column per gender value, holding the mean rating that gender gave to each movie. An additional column 'avg_rating' is added, containing the average of the female and male mean ratings.

In [229]:
# One row per (movie_id, title) and one column per gender, each cell holding
# the mean rating (pivot_table's default aggregation). A movie that has no
# ratings from a gender gets 0 in that column.
pivot = pd.pivot_table(
    merged,
    values='rating',
    index=['movie_id', 'title'],
    columns=['gender'],
    fill_value=0,
)
# Unweighted average of the female and male mean ratings.
pivot['avg_rating'] = (pivot['F'] + pivot['M']) / 2.0

In [230]:
# Move `movie_id` out of the index into a regular column, leaving `title` as
# the only index level.
pivot = pivot.reset_index(level='movie_id')
# Keep the average rating of the movies whose id appears in `top_200`.
in_top_200 = pivot['movie_id'].isin(top_200.index)
best_date_movies = pivot.loc[in_top_200, 'avg_rating']

In [231]:
# Highest `avg_rating` first; the top row is the best date-night movie.
# Series.order was removed from pandas; sort_values is its replacement.
best_date_movies.sort_values(ascending=False).head(1)

title
Shawshank Redemption, The (1994)    4.54985
Name: avg_rating, dtype: float64

The pivot table is filtered to the 200 most-rated movies (the `top_200` selection). The resulting table is sorted by 'avg_rating', and the top movie is the one with the highest combined rating across both genders. Hence, the best date-night movie is: <b> The Shawshank Redemption (1994) </b>

# Part-2 Titanic Data

In [233]:
from IPython.core.display import HTML
# Wrap ../data/titanic.html in an IPython HTML display object. The object is
# only bound to `data`; because the cell ends with an assignment, nothing is
# rendered here.
data = HTML(filename='../data/titanic.html')

In [234]:
# Open the Excel workbook and parse its sheet named "titanic" into the
# DataFrame `t_df` that every Titanic cell below works on.
# NOTE(review): `t_file` is never closed in the visible cells -- confirm no
# later cell still needs it before adding a close().
t_file = pd.ExcelFile('../data/titanic.xls')
t_df = t_file.parse("titanic")

Proportion of passengers that survived, by sex.

In [235]:
# Head-count per sex.
total_females = t_df[t_df.sex == 'female'].shape[0]
total_males = t_df[t_df.sex == 'male'].shape[0]
# Survivor head-count per sex, taken from the (sex, survived == 1) groups.
grouped = t_df.groupby(['sex', 'survived'])
survived_females = grouped.get_group(('female', 1)).shape[0]
survived_males = grouped.get_group(('male', 1)).shape[0]
# print(...) with one pre-formatted string prints the same text under the
# Python 2 print statement and the Python 3 print function.
print('Male Survival %% %f, Female Survival %% %f' % (
    survived_males * 100 / float(total_males),
    survived_females * 100 / float(total_females)))

Male Survival % 19.098458, Female Survival % 72.746781


The survival percentage of females is far higher than that of males.

Now, dataframe is grouped based on survival, class, and sex.

In [236]:
# Group passengers by (survived, pclass, sex). Group keys are tuples in that
# order, which is how the next cell looks groups up with get_group().
g = t_df.groupby(['survived','pclass', 'sex'])

Iterate over every sex/class combination and look up the matching "survived" and "not survived" groups by their group keys.

In [237]:
# Survival percentage for every (sex, class) combination, in order of first
# appearance of each value in the data.
for gender in t_df.sex.unique():
    for i in t_df.pclass.unique():
        survived_key = (1, i, gender)
        killed_key = (0, i, gender)
        # A combination with no rows has no group. Count it as 0 instead of
        # letting get_group raise KeyError. len() counts the rows of the group
        # directly, independent of which column happens to come first.
        survived = len(g.get_group(survived_key)) if survived_key in g.groups else 0
        killed = len(g.get_group(killed_key)) if killed_key in g.groups else 0
        if survived + killed == 0:
            # Nobody in this combination: nothing to report (and no 0/0).
            continue
        print('Class %d %s Survival %%: %f' % (
            i, gender, survived * 100 / float(survived + killed)))


Class 1 female Survival %: 96.527778
Class 2 female Survival %: 88.679245
Class 3 female Survival %: 49.074074
Class 1 male Survival %: 34.078212
Class 2 male Survival %: 14.619883
Class 3 male Survival %: 15.212982


The t_agegroup column is added to assign each passenger to an age group. Pandas' cut function creates the groups from left-closed age intervals: [0, 14) children, [14, 20) adolescents, [20, 64) adult and [64, 150) senior.

In [238]:
# One label per age interval, in the same order as the bin edges below.
labels = ['children', 'adolescents', 'adult', 'senior']
# right=False makes every interval closed on the left and open on the right:
# [0, 14), [14, 20), [20, 64), [64, 150).
t_df['t_agegroup'] = pd.cut(
    t_df.age,
    bins=[0, 14, 20, 64, 150],
    labels=labels,
    right=False,
)
# Group keys are (survived, t_agegroup, pclass, sex) tuples.
grouped = t_df.groupby(by=['survived', 't_agegroup', 'pclass', 'sex'])

Iterate over every combination of age group, gender and class, and calculate its survival percentage.

In [239]:
def _group_size(groupby, key):
    """Return the number of rows in group ``key``, or 0 if no such group exists."""
    try:
        return len(groupby.get_group(key))
    except KeyError:
        return 0


# Passengers without an age have no age group. Drop those explicitly rather
# than slicing off the last unique value, which is only correct when NaN
# happens to appear last.
for agegroup in t_df.t_agegroup.dropna().unique():
    for gender in t_df.sex.unique():
        for class_ in t_df.pclass.unique():
            survived = _group_size(grouped, (1, agegroup, class_, gender))
            killed = _group_size(grouped, (0, agegroup, class_, gender))
            if survived + killed == 0:
                # Nobody in this combination: nothing to report (and no 0/0).
                continue
            percentage = survived * 100 / float(survived + killed)
            print('The %s %s(s) in Class %d, Survival %%: %f' % (
                gender, agegroup, class_, percentage))

The female adult(s) in Class 1, Survival %: 96.491228
The female adult(s) in Class 2, Survival %: 87.341772
The female adult(s) in Class 3, Survival %: 42.696629
The male adult(s) in Class 1, Survival %: 35.114504
The male adult(s) in Class 2, Survival %: 8.527132
The male adult(s) in Class 3, Survival %: 16.475096
The female children(s) in Class 1, Survival %: 0.000000
The female children(s) in Class 2, Survival %: 100.000000
The female children(s) in Class 3, Survival %: 48.387097
The male children(s) in Class 1, Survival %: 100.000000
The male children(s) in Class 2, Survival %: 100.000000
The male children(s) in Class 3, Survival %: 32.432432
The female senior(s) in Class 1, Survival %: 100.000000
The male senior(s) in Class 1, Survival %: 10.000000
The male senior(s) in Class 2, Survival %: 0.000000
The male senior(s) in Class 3, Survival %: 0.000000
The female adolescents(s) in Class 1, Survival %: 100.000000
The female adolescents(s) in Class 2, Survival %: 90.000000
The female 

Overall, females have a higher survival percentage than males in all age groups and/or classes. Children have a higher survival percentage than males (in general), but not as high as females. This shows that <b>women and children were taken care of first.</b>