In [None]:
# Libraries

import os, json, re
import pandas as pd 
from sklearn.preprocessing import LabelEncoder



######### functions for pre-processing ####################################################################

def extract_year_date_month(df, column):
    '''Parse ``column`` as epoch seconds and add weekday/month feature columns.

    ``df`` is modified in place and also returned. ``column`` is overwritten
    with its datetime version and two new columns are added:
        - <column>_weekday: ``.dt.weekday`` of the timestamp (Monday=0 ... Sunday=6)
        - <column>_month:   ``.dt.month`` of the timestamp (1-12)

    Despite the function name, no ``<column>_year`` column is created.
    '''
    timestamps = pd.to_datetime(df[column], unit='s')
    df[column] = timestamps

    # Derive the calendar features from the parsed timestamps
    df[f'{column}_weekday'] = timestamps.dt.weekday
    df[f'{column}_month'] = timestamps.dt.month

    return df


def duration(df, column1, column2):
    '''Returns the duration in days between 2 columns with datetime and puts it into a new colum
        - column1: start date
        - column2: end date
    '''
    df['duration_days'] = (df[column2] - df[column1]).dt.days

    return df

def convert_to_usd(df):
    '''Return ``goal * static_usd_rate`` rounded to 2 decimal places.

    ``df`` only needs to support lookup of the keys 'goal' and
    'static_usd_rate'; in this notebook it is a single row passed in by
    ``DataFrame.apply(..., axis=1)``.
    '''
    usd_amount = df['goal'] * df['static_usd_rate']
    return round(usd_amount, 2)




######### Load the data ##########################################################

directory = 'Kickstarter_data/'
relevant_columns = ['category', 'country', 'state', 'static_usd_rate', 'goal', 'launched_at', 'deadline', 'creator']

# Collect one frame per input file and concatenate once at the end: growing a
# DataFrame with pd.concat inside the loop re-copies every previously loaded
# row on each iteration.
frames = []
for file in sorted(os.listdir(directory)):  # sorted -> deterministic row order
    path = os.path.join(directory, file)
    # Skip hidden entries and sub-directories (e.g. .DS_Store, .ipynb_checkpoints),
    # which pd.read_csv cannot parse.
    if file.startswith('.') or not os.path.isfile(path):
        continue
    df_temp = pd.read_csv(path)
    frames.append(df_temp[relevant_columns])

# Fail here with a clear message instead of later with an unrelated KeyError
if not frames:
    raise FileNotFoundError(f"No data files found in '{directory}'")

data = pd.concat(frames, ignore_index=True)


##### Cleaning ########################################

## Drop duplicates
# 'creator' still takes part in the duplicate check and is only dropped afterwards.
data = data.drop_duplicates(ignore_index=True)
data = data.drop(columns='creator')


## Get the main category from the JSON in the 'category' column
# Each 'category' cell is a JSON string; keep only the part of its 'slug' value
# before the first '/'. Working on the column directly keeps the result on the
# same index as `data`, so no separate frame has to be built and re-aligned.
data['slug'] = data['category'].apply(lambda raw: json.loads(raw)['slug'].split('/', 1)[0])
data = data.drop(columns='category')

# Encode the category names as integer codes (le.classes_ maps codes back to names)
le = LabelEncoder()
data['slug'] = le.fit_transform(data['slug'])

## Work on the 'state' column

# Drop rows whose state is 'live', then binarise the target:
# 1 for 'successful', 0 for every other state.
data = data.loc[data['state'] != 'live'].reset_index(drop=True)
data['state'] = (data['state'] == 'successful').astype('int64')


## Work on the time-related columns 'launched_at' and 'deadline'

# Both columns are parsed as epoch seconds
for ts_column in ['launched_at', 'deadline']:
    data[ts_column] = pd.to_datetime(data[ts_column], unit='s')

# Add launch weekday/month, then the number of days between launch and deadline
data = duration(extract_year_date_month(data, 'launched_at'), 'launched_at', 'deadline')

# The raw timestamps are dropped once the derived columns exist
data = data.drop(columns=['launched_at', 'deadline'])

## Work on 'goal' column

# 1. goal_in_usd = goal * static_usd_rate (rounded to 2 decimals, row by row)
# 2. drop the two source columns
# 3. keep only rows with goal_in_usd below 1,000,000
data = (
    data
    .assign(goal_in_usd=data.apply(convert_to_usd, axis=1))
    .drop(columns=['static_usd_rate', 'goal'])
    .loc[lambda frame: frame['goal_in_usd'] < 1000000]
)

## Work on 'country' column

# Binary flag: 1 when the country code is 'US' or 'CA', 0 otherwise
data['north_america'] = data['country'].isin(['US', 'CA']).astype('int64')
data = data.drop(columns='country')



#save Dataframe in csv_file
#data.to_csv('data/cleaned_data.csv', index=False)

In [None]:
# Display the cleaned DataFrame produced by the preprocessing cell above
data

In [None]:
# LOGISTIC REGRESSION
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

In [None]:
# Define target and features
X = data.drop("state", axis=1)
y = data["state"]

# Train-test split (fixed random_state so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Standardise the features. The scaler is fitted on the training split only and
# then applied to the test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Modelling
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train_scaled, y_train)
y_pred = logistic_regression.predict(X_test_scaled)

# Confusion matrix (sklearn convention: rows = actual class, columns = predicted class)
conf_matrix = confusion_matrix(y_test, y_pred)
# fmt='d' prints the integer counts in full; seaborn's default '.2g' would
# abbreviate large counts to scientific notation (e.g. 1.2e+04).
sns.heatmap(conf_matrix, annot=True, fmt='d').set(
    xlabel='Predicted state', ylabel='Actual state', title='Confusion matrix'
)
print('Accuracy: ', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Reload the cleaned dataset from disk and display it.
# NOTE(review): the `data.to_csv('data/cleaned_data.csv', ...)` line in the
# preprocessing cell is commented out, so this file must already exist from an
# earlier run -- confirm before relying on Restart & Run All.
loaded_data = pd.read_csv(filepath_or_buffer='data/cleaned_data.csv')
loaded_data