In [None]:
# Authenticate with Google and mount Google Drive inside the Colab runtime.
# Every dataset path used later in this notebook lives under /content/drive.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import json
import seaborn as sns
from datetime import datetime

In [None]:
# Duci path in google drive: /content/drive/MyDrive/ML_in_applied_settings/dataset/Kickstarter000.csv

# Columns kept from the raw Kickstarter export (single source of truth).
KICKSTARTER_COLUMNS = ['blurb', 'category', 'country', 'created_at', 'creator', 'currency', 'deadline', 'goal', 'id', 'launched_at', 'location', 'name', 'photo', 'profile', 'state', 'state_changed_at', 'urls']
# A project counts as finished once it has either succeeded or failed.
FINISHED_STATES = ['successful', 'failed']

# For each of the downloaded csv files, keep only finished projects and the
# desired columns. The per-file frames are collected in a list and concatenated
# ONCE: concatenating onto an empty frame inside the loop re-copies all
# accumulated rows on every pass and turned every column into `object` dtype.
finished_parts = []
for i in range(0, 6):
  csv_path = "/content/drive/MyDrive/ML_in_applied_settings/dataset/Kickstarter00{}.csv".format(i)
  df = pd.read_csv(csv_path)
  df = df.loc[df['state'].isin(FINISHED_STATES), KICKSTARTER_COLUMNS]
  finished_parts.append(df)

kickstarter_df = pd.concat(finished_parts, axis = 0, ignore_index = True)

# Drop projects without a location. The explicit .copy() makes the result an
# independent frame, so adding columns to it later cannot trigger
# SettingWithCopyWarning.
kickstarter_df = kickstarter_df[kickstarter_df['location'].notna()].copy()
kickstarter_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17656 entries, 0 to 17669
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   blurb             17656 non-null  object
 1   category          17656 non-null  object
 2   country           17656 non-null  object
 3   created_at        17656 non-null  object
 4   creator           17656 non-null  object
 5   currency          17656 non-null  object
 6   deadline          17656 non-null  object
 7   goal              17656 non-null  object
 8   id                17656 non-null  object
 9   launched_at       17656 non-null  object
 10  location          17656 non-null  object
 11  name              17656 non-null  object
 12  photo             17656 non-null  object
 13  profile           17656 non-null  object
 14  state             17656 non-null  object
 15  state_changed_at  17656 non-null  object
 16  urls              17656 non-null  object
dtypes: object(17

Preprocessing the data set

In [None]:
# extract the features from the json objects

SECONDS_PER_DAY = 86400


def _days_between(later, earlier):
  """Return the gap between two epoch-second columns in days, rounded to one decimal."""
  return (later - earlier).apply(lambda seconds: round(seconds / SECONDS_PER_DAY, 1))


def _web_url(urls, key):
  """Return urls['web'][key], or None when either level is missing."""
  web = urls.get('web') or {}
  return web.get(key)


# Decode each JSON column once per row and test for *keys* on the parsed dict.
# A substring test on the raw text is unreliable: "name" also matches keys that
# merely contain "name", and "project" also matches the URL text itself.
category_json = kickstarter_df['category'].apply(json.loads)
location_json = kickstarter_df['location'].apply(json.loads)
urls_json = kickstarter_df['urls'].apply(json.loads)
photo_json = kickstarter_df['photo'].apply(json.loads)

# Note: a 'company' feature taken from the creator json was tried and dropped,
# it yields just 15 results.

# .assign returns a new frame, so re-running this cell is idempotent and the
# input frame is not mutated in place. 'country' is intentionally overwritten
# with the expanded country name from the location json.
kickstarter_df = kickstarter_df.assign(
  # few categories do not have a parent category and are therefore set to sub_category
  parent_category = category_json.apply(lambda cat: cat['parent_name'] if 'parent_name' in cat else cat['name']),
  sub_category = category_json.apply(lambda cat: cat['name']),
  country = location_json.apply(lambda loc: loc.get('expanded_country')),
  city = location_json.apply(lambda loc: loc.get('name')),
  project_url = urls_json.apply(lambda urls: _web_url(urls, 'project')),
  reward_url = urls_json.apply(lambda urls: _web_url(urls, 'rewards')),
  photo_url = photo_json.apply(lambda photo: photo.get('full')),
  create_to_launch = _days_between(kickstarter_df['launched_at'], kickstarter_df['created_at']),
  create_to_deadline = _days_between(kickstarter_df['deadline'], kickstarter_df['created_at']),
  launch_to_deadline = _days_between(kickstarter_df['deadline'], kickstarter_df['launched_at']),
  # Epoch seconds -> timestamp, interpreted as UTC. datetime.fromtimestamp used
  # the kernel's local timezone, so results changed with the machine; this is
  # identical on a kernel whose local timezone is UTC.
  creation_date = pd.to_datetime(pd.to_numeric(kickstarter_df['created_at']), unit = 's'),
)

CLEAN_COLUMNS = ['id', 'name', 'blurb', 'state', 'goal', 'parent_category', 'sub_category', 'country', 'city', 'currency', 'project_url', 'reward_url', 'photo_url', 'creation_date', 'create_to_launch', 'create_to_deadline', 'launch_to_deadline']

clean_df = kickstarter_df[CLEAN_COLUMNS].copy()
# The same project can appear in several of the source files; keep the first copy.
clean_df = clean_df.drop_duplicates(keep='first')
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15523 entries, 0 to 17669
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   id                  15523 non-null  object        
 1   name                15523 non-null  object        
 2   blurb               15523 non-null  object        
 3   state               15523 non-null  object        
 4   goal                15523 non-null  object        
 5   parent_category     15523 non-null  object        
 6   sub_category        15523 non-null  object        
 7   country             15523 non-null  object        
 8   city                15523 non-null  object        
 9   currency            15523 non-null  object        
 10  project_url         15523 non-null  object        
 11  reward_url          15523 non-null  object        
 12  photo_url           15523 non-null  object        
 13  creation_date       15523 non-null  datetime64

In [None]:
# Persist the cleaned table to the dataset folder on Drive (row index omitted).
filename = 'kickstarter_clean.csv'
output_dir = '/content/drive/MyDrive/ML_in_applied_settings/dataset/'

clean_df.to_csv(f"{output_dir}{filename}", index=False)

Merging the face detection variables with the Kickstarter data set

In [None]:
# Load the results of both face detectors, keeping only the project id
# (the join key) and the detection flag from each file.
face_detection_columns = ['id', 'is_face']

csv_path = "/content/drive/MyDrive/ML_in_applied_settings/dataset/retinaface_face_detection.csv"
retinaface = pd.read_csv(csv_path)[face_detection_columns]

csv_path = "/content/drive/MyDrive/ML_in_applied_settings/dataset/opencv_face_detection.csv"
opencv = pd.read_csv(csv_path)[face_detection_columns]

In [None]:
# Attach both face-detection flags to the cleaned projects.
# Each detector's 'is_face' column is renamed BEFORE merging, so the final
# column names do not depend on pandas' implicit _x/_y suffixes or on the
# exact number and order of columns in clean_df (overwriting `.columns` with a
# positional list silently mislabels data if either ever changes).
retinaface_flags = retinaface[['id', 'is_face']].rename(columns={'is_face': 'is_face_retinaface'})
opencv_flags = opencv[['id', 'is_face']].rename(columns={'is_face': 'is_face_opencv'})

# Inner joins: only projects present in both detector outputs are kept.
kickstarter_face_detection = pd.merge(clean_df, retinaface_flags, on="id", how="inner")
kickstarter_face_detection = pd.merge(kickstarter_face_detection, opencv_flags, on="id", how="inner")



In [None]:
# Write the merged projects-plus-face-flags table to the dataset folder on
# Drive, without the row index.
filename = 'kickstarter_face_detection.csv'
dataset_dir = '/content/drive/MyDrive/ML_in_applied_settings/dataset/'
kickstarter_face_detection.to_csv(dataset_dir + filename, index=False)