<b>Imports</b>

In [121]:
#imports

import csv
import string
from math import *
from bs4 import BeautifulSoup
from collections import defaultdict
import urllib.request

<b>Files that we will be using</b>

In [106]:
#Year window of interest: startRange is included, endRange is not
startRange, endRange = 2005, 2008

#Input CSV files
link = 'links.csv'
movie = 'movies.csv'
ratings = 'ratings.csv'

#Every ASCII punctuation character, used to filter out Parentheses around a year
exclude = {symbol for symbol in string.punctuation}

In [107]:
#Read both CSV files completely and close them straight away, so no file handle
#stays open for the rest of the session. newline='' is what the csv module asks
#for, so that quoted fields containing line breaks are parsed correctly.
#csvfile1 / csvfile2 remain one-shot iterators over the parsed rows, so later
#cells can still call next() on them to skip the header line.
with open(link, 'r', newline='') as file1:
    csvfile1 = iter(list(csv.reader(file1, delimiter=',')))

with open(movie, 'r', newline='') as file2:
    csvfile2 = iter(list(csv.reader(file2, delimiter=',')))

<b>Data Cleaning</b>

We want to limit the number of movies we go through, so we filter the given data and keep only the movies released within our specified range of years (from the start year up to, but not including, the end year).

Fortunately, the year is provided for us in the title itself (lucky us), so we can use split() to split the title into a list of strings, and then clean and format the year (always the last element).

We can then compare the year to the start and end of our range, and if it falls inside, we store the movie ID (together with its year) in a dictionary of accepted IDs.

In [108]:
def cleanYear(yearString, punctSet):
    """Return yearString with every character that occurs in punctSet removed.

    The remaining characters keep their original order, e.g. "(2005)" becomes
    "2005" when punctSet contains both parentheses.
    """
    kept = []
    for symbol in yearString:
        if symbol in punctSet:
            continue
        kept.append(symbol)
    return ''.join(kept)

In [109]:
idList = {} #Dictionary mapping each relevant movie ID to its year
next(csvfile2, None) #skip the first line (no error if the file is empty)
for row in csvfile2:
    if len(row) < 2:
        continue #blank or truncated line: there is no title column to inspect
    titleList = row[1].split()
    if not titleList:
        continue #empty title: nothing to read a year from
    try:
        #The year is the last word of the title, wrapped in punctuation
        year = int(cleanYear(titleList[-1], exclude))
    except ValueError:
        continue #the last word is not a number, so the movie cannot be placed
    if startRange <= year < endRange:
        idList[row[0]] = year




920


In [110]:
#Dictionaries to store year and list of movie IDs
#Each Content Rating gets its own dictionary
gRating = defaultdict(list)
pgRating = defaultdict(list)
pg13Rating = defaultdict(list)
rRating = defaultdict(list)
nc17Rating = defaultdict(list)

#Content rating text found on the page -> dictionary collecting that rating
ratingBuckets = {
    'G': gRating,
    'PG': pgRating,
    'PG-13': pg13Rating,
    'R': rRating,
    'NC-17': nc17Rating,
}

next(csvfile1, None) #skip the first line (no error if the file is empty)
for row in csvfile1:
    if len(row) < 2 or row[0] not in idList:
        continue #blank line, or a movie outside the years of interest
    year = idList[row[0]] #Find the year to use as the key
    imdbId = row[1] #Pull the imdb movie ID
    #Generate the URL to the movie's webpage. It gets its own name so that the
    #links.csv filename stored in `link` is not overwritten.
    pageUrl = "http://www.imdb.com/title/tt" + imdbId + "/"
    try:
        #with-block closes the HTTP response once the page has been read
        with urllib.request.urlopen(pageUrl) as response:
            html = response.read()
    except OSError as err:
        #URLError and HTTPError are OSError subclasses. One unreachable page
        #should not abort the whole crawl, so report it and move on.
        print("Skipping movie " + row[0] + ": could not fetch " + pageUrl + " (" + str(err) + ")")
        continue
    soup = BeautifulSoup(html, "html.parser") #Use BeautifulSoup to parse
    rating = soup.find('meta', itemprop="contentRating") #Find the content rating tag
    if rating is None:
        continue #the page does not declare a content rating
    #Determine which dictionary the info goes in, and put in there.
    #Ratings other than the five tracked ones are ignored.
    bucket = ratingBuckets.get(rating.get('content'))
    if bucket is not None:
        bucket[year].append(row[0])







defaultdict(<class 'list'>, {2005: ['31429', '32289', '33830', '34072', '36397', '38038', '40339', '117368'], 2006: ['45074', '45517', '49013', '49647'], 2007: ['50872', '52287', '68522', '83219']})
defaultdict(<class 'list'>, {2005: ['30793', '31422', '31698', '31700', '32017', '32031', '33004', '33495', '33615', '33639', '33669', '33681', '33815', '34332', '35015', '37729', '37739', '37857', '39292', '39307', '39435', '40629', '40851', '40962', '41566', '42009', '42734'], 2006: ['42730', '43836', '43910', '43917', '44022', '44709', '45208', '45431', '45666', '45668', '45928', '45950', '46850', '46948', '46972', '47124', '47516', '47642', '47644', '48414', '48682', '48711', '48982', '49274', '49524', '49649', '49651', '49793', '50160', '51471', '51660', '56719', '57972', '58347'], 2007: ['50149', '50601', '50923', '51698', '51834', '51939', '52694', '52975', '53121', '53460', '53464', '53993', '54276', '54290', '55250', '55732', '55768', '55872', '56152', '56176', '56775', '56915', '5

In [114]:
#Calculates the average rating by pulling data from the movies whose IDs correspond with the list
#movieList
def calcAverage(movieList):
    """Return the mean rating of all rows in the ratings file whose movie ID is in movieList.

    movieList -- iterable of movie IDs as strings; they are compared with the
                 second column of the file exactly as csv reads it (text).

    The file named by the module-level `ratings` is re-read on every call; its
    first line is skipped as a header and the third column is parsed as a float.

    Returns a float. If no row matches (for example movieList is empty) the
    result is float('nan') instead of a ZeroDivisionError, so a single empty
    group cannot abort a whole batch of calculations.
    """
    wanted = set(movieList) #set lookup instead of scanning a list for every row
    ratingCount = 0
    ratingSum = 0
    #with-block closes the file even if a row fails to parse
    with open(ratings, 'r', newline='') as file:
        csvfile = csv.reader(file, delimiter=',')
        next(csvfile, None) #skip first line
        for row in csvfile:
            #len check skips blank or truncated lines
            if len(row) > 2 and row[1] in wanted:
                ratingCount += 1
                ratingSum += float(row[2])
    if ratingCount == 0:
        return float('nan')
    return ratingSum / ratingCount


In [117]:
#Helper Function: arithmetic mean of the values in x
def mean(x):
    """Return the sum of the items of x divided by the number of items.

    Raises ZeroDivisionError when x is empty.
    """
    total = sum(x)
    return total / len(x)

#Helper Function: Calculates the square root of the sum of squared (a - the mean)
def pearson_rooted(x):
    """Return sqrt(sum((a - mean(x)) ** 2 for a in x)).

    x -- sequence of numbers
    Returns 0.0 for an empty sequence (the sum of no terms is 0).
    """
    if len(x) == 0:
        return 0.0 #nothing to average; avoids dividing by zero inside mean()
    centre = mean(x) #computed once instead of twice for every element
    return sqrt(sum([(a - centre) * (a - centre) for a in x]))

#Defines the Pearson Relation
def pearson(x, y):
    """Return the Pearson correlation coefficient of the sequences x and y.

    x, y -- sequences of numbers, paired up position by position

    Returns a float, or the string 'Error: Division by Zero' when the
    denominator is zero (an input is empty or has no variation).

    Note: zip() stops at the shorter sequence, while the means and the
    denominator use each full sequence, so inputs of different lengths do not
    give a meaningful result.
    """
    if len(x) == 0 or len(y) == 0:
        #An empty input makes the denominator zero; answer before calling mean()
        return 'Error: Division by Zero'
    #Each mean is computed once instead of once per pair
    mean_x = mean(x)
    mean_y = mean(y)
    num = sum((a - mean_x) * (b - mean_y) for a, b in zip(x, y))
    denom = pearson_rooted(x) * pearson_rooted(y)
    if denom == 0:
        #Error Catching in case of zero division
        return 'Error: Division by Zero'
    return float(num) / denom

In [116]:
#Lists used to store results of average rating calculations, one per Content Rating
avgRatingG, avgRatingPG, avgRatingPG13, avgRatingR, avgRatingNC17 = [], [], [], [], []

#Lists used to store results of movie count calculations, one per Content Rating
movieCountG, movieCountPG, movieCountPG13, movieCountR, movieCountNC17 = [], [], [], [], []

#(movies by year, movie count list, average rating list) for every Content Rating,
#in the order G, PG, PG-13, R, NC-17
ratingGroups = (
    (gRating, movieCountG, avgRatingG),
    (pgRating, movieCountPG, avgRatingPG),
    (pg13Rating, movieCountPG13, avgRatingPG13),
    (rRating, movieCountR, avgRatingR),
    (nc17Rating, movieCountNC17, avgRatingNC17),
)

#For each year of the range and each Content Rating, append the number of movies
#of that year and then their average rating to the matching result lists
for i in range(startRange, endRange):
    for moviesByYear, counts, averages in ratingGroups:
        counts.append(len(moviesByYear[i]))
        averages.append(calcAverage(moviesByYear[i]))

Average Ratings:
[3.169642857142857, 3.3076923076923075, 3.6875]
[3.375, 3.1017964071856285, 3.164]
[3.296954314720812, 3.3239895697522814, 3.346547314578005]
[3.4216049382716047, 3.5983050847457627, 3.483065953654189]
[3.1666666666666665, 3.4, 3.5]
Number of movies (2005, 2006, 2007)
[8, 4, 4]
[27, 34, 28]
[90, 86, 85]
[114, 134, 152]
[1, 1, 2]


The next block calculates the Pearson Correlation between average ratings of movies based on Content Ratings, as well as the Pearson Correlation between movie counts of each content rating.

In [129]:
#Header printed before each group of comparisons; entry k introduces the
#comparisons of the k-th Content Rating with every rating after it
comparisonHeaders = (
    "Comparing G to PG, PG13, R, and NC17...",
    "Comparing PG to PG13, R, and NC17...",
    "Comparing PG-13 to R, and NC17...",
    "Comparing R to NC17...",
)

#Section title and its data lists in the order G, PG, PG-13, R, NC-17
reportSections = (
    ("Calculating Pearson Relation of Average Ratings...",
     (avgRatingG, avgRatingPG, avgRatingPG13, avgRatingR, avgRatingNC17)),
    ("Calculating Pearson Relation of Movie Counts...",
     (movieCountG, movieCountPG, movieCountPG13, movieCountR, movieCountNC17)),
)

for sectionIndex, (sectionTitle, seriesByRating) in enumerate(reportSections):
    if sectionIndex > 0:
        print() #blank line between the two sections
    print(sectionTitle)
    for first, header in enumerate(comparisonHeaders):
        print(header)
        #Compare this rating with each rating that comes after it
        for other in seriesByRating[first + 1:]:
            print(pearson(seriesByRating[first], other))


Calculating Pearson Relation of Average Ratings...
Comparing G to PG, PG13, R, and NC17...
-0.5353983357961796
0.9506863976443302
0.08627656687854691
0.8822213579364225
Comparing PG to PG13, R, and NC17...
-0.7709517072598727
-0.8876426897729918
-0.8700068288055155
Comparing PG-13 to R, and NC17...
0.3910192547182371
0.9847470642682293
Comparing R to NC17...
0.5451942635939833

Calculating Pearson Relation of Movie Counts...
Comparing G to PG, PG13, R, and NC17...
-0.6099942813304187
0.9819805060619655
-0.8808122718846413
-0.5000000000000001
Comparing PG to PG13, R, and NC17...
-0.4492518698068122
0.16211349747530154
-0.3812464258315117
Comparing PG-13 to R, and NC17...
-0.9544170819209595
-0.6546536707079771
Comparing R to NC17...
0.8504394349231019
