In [35]:
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from collections import Counter

In [36]:
# Location of the FilmFreeway submissions export, relative to the notebook.
SUBMISSIONS_CSV = "data/filmfreeway-submissions.csv"
subs_df = pd.read_csv(SUBMISSIONS_CSV)

In [37]:
# Per-column dtypes and non-null counts, followed by the overall dimensions.
subs_df.info()
rows = len(subs_df.index)
cols = len(subs_df.columns)
print(f'{rows} rows & {cols} columns.')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Project Title          103 non-null    object 
 1   Duration               103 non-null    object 
 2   Genres                 42 non-null     object 
 3   First Name             103 non-null    object 
 4   Last Name              103 non-null    object 
 5   Email                  103 non-null    object 
 6   City                   67 non-null     object 
 7   State                  65 non-null     object 
 8   Country                67 non-null     object 
 9   Submission Date        103 non-null    object 
 10  Submission Status      103 non-null    object 
 11  Submission Categories  103 non-null    object 
 12  Rating                 0 non-null      float64
 13  Submission ID          103 non-null    int64  
 14  Birthdate              83 non-null     object 
 15  Gender

In [38]:
# Distinct values present in the 'Submission Status' column.
print(set(subs_df['Submission Status']))

{'Withdrawn', 'Incomplete', 'In Consideration'}


## CLEAN DATA
Note: Should "Incomplete" entries be dropped?

In [39]:
# Drop withdrawn entries. The explicit .copy() makes the filtered frame an
# independent object, so the column assignments in later cells do not raise
# SettingWithCopyWarning.
subs_df = subs_df[subs_df['Submission Status'] != 'Withdrawn'].copy()

# TODO: inspect duplicated project titles, e.g.
# subs_df[subs_df['Project Title'].duplicated()]

In [40]:
# Shorten the long category label to 'Feel Good Shorts'.
# regex=False keeps this a literal substring replacement (the label ends in
# '.', which would otherwise be a regex wildcard); the vectorized .str accessor
# also passes missing values through instead of raising like apply(lambda) did.
subs_df['Submission Categories'] = subs_df['Submission Categories'].str.replace(
    'Short films that make you feel good.', 'Feel Good Shorts', regex=False
)

In [41]:
# Force the combined 'Youth Short Films, Short Documentary' label down to
# 'Short Documentary'.
# regex=False makes this a literal substring replacement; the vectorized .str
# accessor passes missing values through instead of raising like apply(lambda).
subs_df['Submission Categories'] = subs_df['Submission Categories'].str.replace(
    'Youth Short Films, Short Documentary', 'Short Documentary', regex=False
)

In [42]:
# Drop 'Rating' (no values at all) and 'Genres' (missing for most submissions).
# Reassigning instead of inplace=True avoids mutating a filtered frame in place;
# errors='ignore' keeps the cell safe to re-run once the columns are gone.
subs_df = subs_df.drop(columns=['Rating', 'Genres'], errors='ignore')

In [46]:
# Fill missing states with an empty string so later string handling and the
# length-based bucketing work on every row.
# Assign the result back: calling fillna(inplace=True) on a selected column is
# chained assignment, which pandas deprecates and which does not update the
# frame under Copy-on-Write.
subs_df['State'] = subs_df['State'].fillna('')

## EDA

In [43]:
# Show the column labels of subs_df as the cell output.
subs_df.columns

Index(['Project Title', 'Duration', 'First Name', 'Last Name', 'Email', 'City',
       'State', 'Country', 'Submission Date', 'Submission Status',
       'Submission Categories', 'Submission ID', 'Birthdate', 'Gender'],
      dtype='object')

In [44]:
# Tally the values of the 'Gender' column; missing entries are counted under nan.
gender_count = Counter(subs_df['Gender'])
print(gender_count)

Counter({'Male': 59, 'Female': 26, nan: 13, 'Other': 2})


In [47]:
# Unify spelled-out / lowercase state names to their two-letter codes.
# Values are matched as a whole (after trimming surrounding whitespace) rather
# than through chained substring str.replace calls, which would also rewrite
# fragments inside other names (e.g. 'ny' in 'Germany', 'Virginia' in
# 'West Virginia'). Values without a mapping, including missing ones, are
# returned unchanged.
state_codes = {
    'maryland': 'MD',
    'Maryland': 'MD',
    'Virginia': 'VA',
    'Georgia': 'GA',
    'Pennsylvania': 'PA',
    'Florida': 'FL',
    'New York': 'NY',
    'ny': 'NY',
}
subs_df['State'] = subs_df['State'].str.strip().map(
    lambda name: state_codes.get(name, name)
)

In [67]:
# Count submissions per 'State' value and split the (value, count) pairs into
# three buckets: two-character values, empty strings, and everything else.
state_count = Counter(subs_df['State'])
us_state_count = [
    (place, total) for place, total in state_count.items() if len(place) == 2
]
international_count = [
    (place, total)
    for place, total in state_count.items()
    if place != '' and len(place) != 2
]
unknown_loc_count = [
    (place, total) for place, total in state_count.items() if place == ''
]
# One print call with a newline separator emits the same lines as printing
# each heading and list individually.
print(
    'US COUNT:',
    us_state_count,
    '\nINTERNATIONAL COUNT',
    international_count,
    '\nUNKNOWN COUNT',
    unknown_loc_count,
    sep='\n',
)

US COUNT:
[('VA', 9), ('MD', 20), ('CA', 3), ('GA', 1), ('DE', 1), ('NY', 3), ('WI', 1), ('PA', 3), ('DC', 1), ('FL', 2), ('NJ', 1), ('HI', 1), ('IL', 1), ('TX', 1), ('LA', 1), ('NC', 1), ('MA', 1), ('OH', 1)]

INTERNATIONAL COUNT
[('Cornwall and Isles of Scilly', 1), ('Binh Thuan', 1), ('Valladolid', 1), ('Málaga', 1), ('Ontario', 1), ('London', 1), ('kurdestan', 1), ('Luristan', 1), ('fars', 1), ('Rio Grande do Sul', 1), ('Maharashtra', 1), ('Buenos Aires', 1)]

UNKNOWN COUNT
[('', 36)]
