In [1]:
import csv
import pandas as pd
import statistics as stats
import re

In [None]:
# Load the plot summaries file into a list holding one string per line
# (iterating a text file yields its lines, newline characters included).
with open("plot_summaries.txt") as summary_file:
    summaries = list(summary_file)

In [3]:
# Sanity check: show the type of the first entry read from the summaries file.
print(type(summaries[0]))

<class 'str'>


In [5]:
# Number of whitespace-separated words in each summary, in the same order
# as `summaries`.
summary_word_lengths = [
    len(summary.split())
    for summary in summaries
]

In [6]:
def get_averages(text):
    """Summarise numeric data with several measures of central tendency.

    Parameters
    ----------
    text : iterable of numbers
        Despite its name, this holds numeric values (the notebook passes
        lists of word counts). Any iterable is accepted, including one-shot
        iterators and generators.

    Returns
    -------
    dict
        Keys ``'mean'``, ``'median'``, ``'mode'`` and ``'median grouped'``,
        each mapped to the corresponding ``statistics`` result.

    Raises
    ------
    statistics.StatisticsError
        If ``text`` is empty. Depending on the Python version,
        ``statistics.mode`` may also raise when the data has no single most
        common value.
    """
    # Materialise the input once: every statistics function below walks the
    # data, so an iterator would be exhausted after the first call and the
    # remaining calls would see no data at all.
    values = list(text)
    return {
        'mean': stats.mean(values),
        'median': stats.median(values),
        'mode': stats.mode(values),
        'median grouped': stats.median_grouped(values),
    }

In [7]:
# Averages of the per-summary word counts; the bare name on the last line
# makes the notebook display the resulting dict.
averages = get_averages(summary_word_lengths)
averages

{'mean': 311.7183614617312,
 'median': 188.0,
 'median grouped': 187.68072289156626,
 'mode': 66}

In [8]:
# Keep only the summaries whose word count lies strictly between 300 and 400,
# together with those word counts, then summarise the retained counts.
selected = [
    (summary, n_words)
    for summary, n_words in zip(summaries, summary_word_lengths)
    if 300 < n_words < 400
]
text = [summary for summary, _ in selected]
counts = [n_words for _, n_words in selected]

get_averages(counts)

{'mean': 348.2699631449631,
 'median': 347.0,
 'median grouped': 347.1774193548387,
 'mode': 327}

In [6]:
## read in original metadata
# Both files are read as tab-separated text with header=None, so the columns
# get integer labels (0, 1, 2, ...), and index_col=None keeps a default
# integer row index.
# pd.read_csv is used because DataFrame.from_csv was deprecated in pandas 0.21
# and removed in pandas 1.0; from_csv simply forwarded these arguments to
# read_csv, so the resulting frames are the same.
char_data = pd.read_csv('character.metadata.tsv', sep='\t', header=None, index_col=None)
film_data = pd.read_csv('movie.metadata.tsv', sep='\t', header=None, index_col=None)

In [None]:
# Show the character rows belonging to one film id.
# DataFrame.rename returns a new frame rather than modifying char_data, so the
# filter is chained onto the renamed frame; char_data itself is left unchanged
# and column 0 is only labelled "film_id" in the displayed result.
char_data.rename(columns={0: "film_id"}).loc[lambda df: df["film_id"].isin([9031450])]

In [31]:
## drop irrelevant columns and rows with NaN
# Columns are still addressed by their integer labels at this point.
unused_char_columns = [1, 4, 6, 7, 10, 11, 12]
char_data = char_data.drop(unused_char_columns, axis=1).dropna()
# Number of rows left after cleaning.
char_data.shape[0]

133378

In [58]:
# Drop the unused film metadata columns (integer labels) and any row that
# still contains a missing value, then report how many rows remain.
unused_film_columns = [1, 4, 5, 8]
film_data = film_data.drop(unused_film_columns, axis=1).dropna()
film_data.shape[0]

74839

In [32]:
## function to convert film release dates to year only
def get_film_year(string):
    """Return the first run of four digits found in ``string``.

    Parameters
    ----------
    string : object
        Expected to be a release-date string. Any value that is not a
        ``str`` (for example ``None`` or a float NaN) is treated as having
        no year.

    Returns
    -------
    str or None
        The matched four digits as a string, or ``None`` when the input is
        not a string or contains no four consecutive digits.
    """
    # Explicit guards replace the former `except TypeError: pass`, which also
    # hid unrelated errors (e.g. match objects are not subscriptable before
    # Python 3.6, which made every call silently return None).
    if not isinstance(string, str):
        return None
    match = re.search(r"\d{4}", string)
    if match is None:
        return None
    return match.group(0)

In [33]:
## relabel columns with more useful names
# Map the remaining integer column labels to descriptive names.
char_data = char_data.rename(
    columns={
        0: "film_id",
        2: "film_release_date",
        3: "char_name",
        5: "actor_gender",
        8: "actor_name",
        9: "actor_age",
    }
)

In [None]:
## convert film release dates to years
# Apply get_film_year to every release date and store the result back in the
# same column; the bare name on the last line displays the frame.
release_years = char_data["film_release_date"].map(get_film_year)
char_data["film_release_date"] = release_years
char_data

In [35]:
# Order the rows by release year, then report the row count.
char_data = char_data.sort_values("film_release_date")
char_data.shape[0]

133378

In [36]:
## keep only rows with a positive actor age; zero and negative ages are dropped
has_positive_age = char_data["actor_age"] > 0
char_data = char_data.loc[has_positive_age]
char_data

Unnamed: 0,film_id,film_release_date,char_name,actor_gender,actor_name,actor_age
132703,5954041,1908,Dorothy Gale,F,Romola Remus,8.0
280213,73413,1909,Farmer's Little Daughter,F,Gladys Egan,4.0
26342,6735825,1909,The Catholic's Child,F,Edith Haldeman,4.0
383795,4210812,1910,Dorothy Gale,F,Bebe Daniels,9.0
47438,7253379,1911,"George IV, The Child King",F,Marie Eline,9.0
180359,7253646,1911,The Little Daughter,F,Marie Eline,9.0
445290,7253406,1911,The Youngster,F,Marie Eline,9.0
387696,7257285,1912,The Poor Couple's Daughter,F,Marie Eline,10.0
266245,24463054,1912,Little Trixie Thompson,F,Magda Foy,6.0
326127,7259226,1912,The Child,F,Helen Badgley,3.0


In [37]:
## save clean data to csv
# index=True is the pandas default, spelled out here: the row labels are
# written as the first column of the file.
char_data.to_csv("char_data_clean.csv", index=True)