In [75]:
import pandas as pd
df_rt_info = pd.read_csv('zippedData/rt.movie_info.tsv.gz', sep='\t', compression='gzip')
df_rt_rev = pd.read_csv('zippedData/rt.reviews.tsv.gz', sep='\t', compression='gzip', encoding='iso-8859-1')
df_rt_info.head(2)

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971","Sep 25, 2001",,,104 minutes,
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000.0,108 minutes,Entertainment One


Looking at these two dataframes, I want to join review data to the movie data, but this would be a many-to-one join and would make an extremely large and unworkable merge. Therefore, I want to create an indicator of average review for each movie so I can add columns to the rt_info dataframe. Aftwr previewing the 'rating' column, I can see that the scale of ratings is different in many instances, and some are numerical while some are not. Since it would be pretty messy to try and make this column workable, I am instead going to use the 'fresh' column to get a percentage of overall positive reviews. 

In [76]:
# insert a new column with a value of zero 
df_rt_rev['fresh_ind'] = 0

# loop through the data and input a 1 where the rating is 'fresh'
for row in df_rt_rev.index:
    if df_rt_rev['fresh'][row] == 'fresh':
        df_rt_rev['fresh_ind'][row] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [77]:
# pivot the data to look at aggregate values by movie: total ratings (count), and total positive ratings (sum)
df_grouped = df_rt_rev.groupby(['id'])['fresh_ind'].agg(['count', 'sum'])

#insert a column to get a % of total positive reviews
df_grouped['fresh_pct'] = df_grouped['sum'] / df_grouped['count']
df_grouped.reset_index()
df_grouped.head(2)

Unnamed: 0_level_0,count,sum,fresh_pct
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,163,103,0.631902
5,23,18,0.782609


In [78]:
rt_all = pd.merge(df_rt_info, df_grouped, how='inner', on='id')

In [79]:
#there are fewer movies in this df so I will expect it to return some nulls. Then clean up old movies
rt_all.head(2)

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio,count,sum,fresh_pct
0,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000.0,108 minutes,Entertainment One,163,103,0.631902
1,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,"Sep 13, 1996","Apr 18, 2000",,,116 minutes,,23,18,0.782609


In [80]:
rt_all['theater_date'] = pd.to_datetime(rt_all['theater_date'])
rt_all['release_year'] = rt_all['theater_date'].dt.year
rt_all['release_year'] = rt_all['release_year'].fillna(0)

In [81]:
rt_all = rt_all.astype({'release_year': 'int64'})
rt_all.head()

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio,count,sum,fresh_pct,release_year
0,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,2012-08-17,"Jan 1, 2013",$,600000.0,108 minutes,Entertainment One,163,103,0.631902,2012
1,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,1996-09-13,"Apr 18, 2000",,,116 minutes,,23,18,0.782609,1996
2,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,1994-12-09,"Aug 27, 1997",,,128 minutes,,57,32,0.561404,1994
3,8,The year is 1942. As the Allies unite overseas...,PG,Drama|Kids and Family,Jay Russell,Gail Gilchriest,2000-03-03,"Jul 11, 2000",,,95 minutes,Warner Bros. Pictures,75,56,0.746667,2000
4,10,Some cast and crew from NBC's highly acclaimed...,PG-13,Comedy,Jake Kasdan,Mike White,2002-01-11,"Jun 18, 2002",$,41032915.0,82 minutes,Paramount Pictures,108,50,0.462963,2002


In [87]:
rt_filtered = rt_all[(rt_all['release_year'] >= 2010)]

In [None]:
#analyze reviews by genre, studio