### Disclaimer: This notebook contains functions to load the data from the individual CSV tables and to perform data cleaning. It is meant to be executed once. The final dataframe is saved as a .csv file, which can then be used for machine-learning-based classification.

# Kickstarter Project

### Definition of relevant columns

* backers_count: amount of people pledging money to the project                                     
* category -> 'slug': name of the project's specific parent- & sub-category (part of json string)
* country: country of the project's creator
* creator -> 'id': id of the creator -> to be used as categorical variable (part of json string)
* goal: information on the amount of money needed to succeed in the local currency of the project
* launched_at: start date of the project (Unix timestamp in seconds)
* deadline: end date of the project (Unix timestamp in seconds)
* spotlight: project highlighted on the website
* staff_pick: marked by a staff member of kickstarter (more attention drawn towards project)
* state: (successful/failed/canceled/live/suspended) -> exclude 'live' and combine 'canceled', 'suspended' with 'failed'
* static_usd_rate: exchange rate used to convert the goal from the project's local currency to USD



### Stakeholder: Project creator 
### Question: Is it useful to put much effort into launching a campaign on kickstarter? 
### Measure: Is the campaign likely to succeed or fail?

## Import Libraries

In [None]:
# Libraries

import os, json, re
import pandas as pd 



## Important Functions

In [None]:
######### functions for pre-processing ####################################################################

def extract_year_date_month(df, column, include_year=False):
    '''Convert a timestamp column to datetime and derive calendar features from it.

    The frame is modified in place and also returned, so
    ``df = extract_year_date_month(df, col)`` and a bare call both work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``column``.
    column : str
        Name of the column to convert. The column is passed to
        ``pd.to_datetime(..., unit='s')``, i.e. numeric values are read as
        seconds since the Unix epoch.
    include_year : bool, default False
        If True, additionally create ``column_year``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``column`` converted to datetime and the new columns:
            - column_weekday (0 = Monday ... 6 = Sunday)
            - column_month   (1-12)
            - column_year    (only when ``include_year`` is True)
    '''
    # Convert column in df to datetime
    df[column] = pd.to_datetime(df[column], unit='s')

    # Derive the calendar components from the converted column
    timestamps = df[column].dt
    df[column + '_' + 'weekday'] = timestamps.weekday
    df[column + '_' + 'month'] = timestamps.month
    if include_year:
        df[column + '_' + 'year'] = timestamps.year

    return df


def duration(df, column1, column2):
    '''Returns the duration in days between 2 columns with datetime and puts it into a new colum
        - column1: start date
        - column2: end date
    '''
    df['duration_days'] = (df[column2] - df[column1]).dt.days

    return df

def convert_to_usd(df):
    '''Return the goal converted to USD, rounded to 2 decimals.

    Multiplies ``df['goal']`` by ``df['static_usd_rate']``. ``df`` may be any
    object indexable by those two keys, e.g. a single row handed over by
    ``DataFrame.apply(..., axis=1)``.
    '''
    goal_usd = df['goal'] * df['static_usd_rate']
    return round(goal_usd, 2)

######### functions for analysing predictions ########################################################## 



## Load data into one dataframe

In [None]:
directory = 'Kickstarter_data/'
relevant_columns = ['category', 'country', 'creator', 'state', 'static_usd_rate', 'goal', 'launched_at', 'deadline']

# Only read the CSV exports: os.listdir also returns stray entries
# (e.g. .DS_Store, .ipynb_checkpoints) that would make pd.read_csv fail.
csv_files = sorted(file for file in os.listdir(directory) if file.lower().endswith('.csv'))
if not csv_files:
    raise FileNotFoundError("No .csv files found in '" + directory + "'")

# Collect the per-file frames and concatenate once; calling pd.concat inside
# the loop re-copies all previously loaded rows on every iteration.
frames = []
for file in csv_files:
    df_temp = pd.read_csv(os.path.join(directory, file))
    frames.append(df_temp[relevant_columns])

data = pd.concat(frames, ignore_index=True)

data.head()

In [None]:
# Column dtypes and non-null counts of the concatenated raw data
data.info()

In [None]:
# Remove fully identical rows and renumber the remaining rows from 0
data = data.drop_duplicates().reset_index(drop=True)

In [None]:
# Same overview again, now without duplicate rows
data.info()

## Work on the json string columns

### Extract the 'slug' parameter from the category column and drop the category column

In [None]:
# Parse each category JSON string and keep only its 'slug' entry.
# Mapping over the column itself keeps every slug on its own row regardless of
# the current index and avoids expanding all JSON keys into a temporary frame;
# .get() yields None when a record has no 'slug' key.
data['slug'] = data['category'].apply(lambda raw: json.loads(raw).get('slug'))
data = data.drop("category", axis=1)

### Extract the ID from the creator column and drop the creator column

In [None]:
# The creator id is taken as the first run of digits in the raw creator string
data["creator_id"] = [re.findall(r'\d+', raw_creator)[0] for raw_creator in data["creator"]]
data = data.drop(columns="creator")


### Exclude rows that have the state "live" 
#### We can't use them because we don't know whether the campaigns will succeed or fail

In [None]:
# Keep every campaign that is not 'live', then renumber the rows from 0
is_live = data['state'] == 'live'
data = data[~is_live].reset_index(drop=True)

### Assign 1 to state == 'successful' and 0 to 'failed', 'canceled' or 'suspended'

In [None]:
# Binary target: 1 for 'successful', 0 for every other state
data['state'] = (data['state'] == 'successful').astype('int64')

data['state'].value_counts()

## Work on the datetime columns

### Convert date-data to type date.time()

In [None]:
# Both timestamp columns are parsed with unit='s' (seconds since the Unix epoch)
for date_column in ('launched_at', 'deadline'):
    data[date_column] = pd.to_datetime(data[date_column], unit='s')

## Work on creator_id column 
### Create a new array indicating whether a creator had a successful campaign before.

In [None]:
# data.head()


In [None]:
# creators = data.creator_id.value_counts().to_frame().reset_index()
# multi_creators = creators[creators['count'] > 1]
# multi_creators

### For now: Drop the column

In [None]:
# creator_id is not used as a feature for now
data = data.drop(columns='creator_id')

### Extract weekday and month of kickstarter project launch, as well as the duration of the kickstarter project and drop the "launched_at" and "deadline" column

In [None]:
# launched_at -> weekday/month features, then the campaign length in days
data = duration(extract_year_date_month(data, 'launched_at'), 'launched_at', 'deadline')

# The raw timestamps are not needed once the derived features exist
data = data.drop(columns=['launched_at', 'deadline'])

### Convert unit of "goal" to USD and drop "static_usd_rate" and "goal" column

In [None]:
# Vectorised conversion: multiply the two columns as a whole instead of
# calling a Python function once per row via DataFrame.apply(axis=1).
data['goal_in_usd'] = (data['goal'] * data['static_usd_rate']).round(2)
data = data.drop(['static_usd_rate', 'goal'], axis=1)

In [None]:
# Keep only campaigns whose goal is below 1,000,000 USD
below_goal_cap = data['goal_in_usd'] < 1000000
data = data.loc[below_goal_cap]

In [None]:
# Dtypes and non-null counts after the goal filter
data.info()

### Check balance

In [None]:
# Number of rows per value of the binary target 'state'
data.state.value_counts()


#### Classes:  % succeeded,  % failed 

## Pre-Processing

### Country to north america True/False

In [None]:
# 1 for campaigns from the US or Canada, 0 for every other country
data["north_america"] = data["country"].isin(['US', 'CA']).astype('int64')

In [None]:
# Number of rows per value of the north_america flag
data.north_america.value_counts()

In [None]:
# Dtypes and non-null counts, now including north_america
data.info()

In [None]:
# Display the frame while 'country' is still present
data

In [None]:
# 'country' is replaced by the north_america flag
data = data.drop(columns='country')

In [None]:
# Display the frame after 'country' was dropped
data

In [None]:
# Number of rows per full slug value
data.slug.value_counts()

In [None]:
# Keep only the part of each slug in front of the first '/'
data["slug"] = [re.split(r'/', full_slug)[0] for full_slug in data["slug"]]

In [None]:
# Display the frame with the shortened slug values
data

In [None]:
# Number of rows per shortened slug value
data.slug.value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the slug values, then replace them by their integer codes
le = LabelEncoder()
le.fit(data['slug'])

data['slug'] = le.transform(data['slug'])


In [None]:
# Display the frame with the integer-encoded slug column
data

In [None]:
import seaborn as sns

In [None]:
# Pairwise plots of all columns of data, coloured by the target 'state'
sns.pairplot(data, hue='state')

In [None]:
# One-hot encode the categorical features; drop_first removes the first level of each
categorical_columns = ['slug', 'launched_at_weekday', 'launched_at_month']
oh_data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)
oh_data

In [None]:
# Write data (the label-encoded frame, not the one-hot oh_data) to disk without the index column
data.to_csv('cleaned_data.csv', index=False)

In [None]:
# Distinct goal amounts (in USD) present in data
data.goal_in_usd.unique()

In [None]:
# NOTE(review): 'backers_count' is not listed in relevant_columns in the loading cell,
# so this lookup fails on a fresh run unless that column is loaded — confirm
data.backers_count.unique()

In [None]:
# Show spotlight next to the target.
# NOTE(review): 'spotlight' is not listed in relevant_columns in the loading cell,
# so this selection fails on a fresh run unless that column is loaded — confirm
data[['spotlight', 'state']]

In [None]:
# Rows where spotlight and the target differ.
# NOTE(review): 'spotlight' is not listed in relevant_columns in the loading cell,
# so this comparison fails on a fresh run unless that column is loaded — confirm
data[data['spotlight'] != data['state']]

In [None]:
# Rows where staff_pick and the target differ.
# NOTE(review): 'staff_pick' is not listed in relevant_columns in the loading cell,
# so this comparison fails on a fresh run unless that column is loaded — confirm
data[data['staff_pick'] != data['state']]

In [None]:
# errors='ignore' keeps this cell runnable when 'spotlight' is not present
# (it is not among relevant_columns in the loading cell) or was already dropped
data = data.drop(columns='spotlight', errors='ignore')

In [None]:
# errors='ignore' keeps this cell runnable when 'staff_pick' is not present
# (it is not among relevant_columns in the loading cell) or was already dropped
data = data.drop(columns='staff_pick', errors='ignore')

In [None]:
# Display the current state of data
data

In [None]:
# Keep only campaigns whose goal is below 1,000,000 USD
below_goal_cap = data['goal_in_usd'] < 1000000
data = data.loc[below_goal_cap]

In [None]:
# Pairwise plots of all columns of data, coloured by the target 'state'
sns.pairplot(data,hue='state')

In [None]:
import seaborn as sns

## Baseline Model

In [None]:
# LOGISTIC REGRESSION
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

In [None]:
# Define target and features (label-encoded frame `data`)
X = data.drop("state", axis =1)
y = data["state"]

#-------------------------------------------------------------------------------

# Train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Modelling
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train_scaled, y_train)
y_pred = logistic_regression.predict(X_test_scaled)

# Confusion matrix via sklearn (rows: true class, columns: predicted class).
# fmt='d' prints the counts as plain integers; the default annotation format
# switches to scientific notation for large counts.
conf_matrix = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(conf_matrix, annot=True, fmt='d')
ax.set(xlabel='Predicted state', ylabel='True state', title='Logistic regression baseline (data)')
print('Accuracy: ', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Define target and features (one-hot encoded frame `oh_data`)
X = oh_data.drop("state", axis =1)
y = oh_data["state"]

#-------------------------------------------------------------------------------

# Train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Modelling
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train_scaled, y_train)
y_pred = logistic_regression.predict(X_test_scaled)

# Confusion matrix via sklearn (rows: true class, columns: predicted class).
# fmt='d' prints the counts as plain integers; the default annotation format
# switches to scientific notation for large counts.
conf_matrix = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(conf_matrix, annot=True, fmt='d')
ax.set(xlabel='Predicted state', ylabel='True state', title='Logistic regression baseline (oh_data)')
print('Accuracy: ', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))