In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

%matplotlib inline

In [2]:
# Read the Kickstarter projects CSV; the two timestamp columns are parsed
# as datetimes at load time so the `.dt` accessor works on them later.
df = pd.read_csv(
    filepath_or_buffer='../inputs/kickstarter_projects.csv',
    parse_dates=['deadline', 'launched'],
)

In [3]:
# Exclude projects that are still live (their result is not known yet), then
# add a binary `outcome` label: 1 when state is "successful", 0 for every
# other remaining state.
df = (
    df
    .query('state != "live"')
    .assign(outcome=lambda projects: (projects['state'] == 'successful').astype(int))
)
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,outcome
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:00,0.0,failed,0,GB,0.0,0.0,1533.95,0
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:00,2421.0,failed,15,US,100.0,2421.0,30000.0,0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:00,220.0,failed,3,US,220.0,220.0,45000.0,0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:00,1.0,failed,1,US,1.0,1.0,5000.0,0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:00,1283.0,canceled,14,US,1283.0,1283.0,19500.0,0


In [4]:
# Feature engineering: split the launch timestamp into calendar components.
# Each name below is both the new column name and the `.dt` attribute read
# from `launched`; columns are added in the order listed.
df = df.assign(**{
    part: getattr(df['launched'].dt, part)
    for part in ('hour', 'day', 'month', 'year')
})

## Test the hypothesis: "There are certain times of year where pledge amounts are higher."

To test the hypothesis that pledge amounts are higher at certain times of the year, we compare pledge amounts across time periods. Here we group projects by the calendar month in which they were launched and use a one-way ANOVA (Analysis of Variance) to compare the mean pledge amount between months.

The null hypothesis of the ANOVA test is that the mean pledge amount is the same for every time period; the alternative hypothesis is that at least one mean differs. If the p-value from the ANOVA test is below the significance level (alpha), we reject the null hypothesis and conclude that the mean pledge amount is not the same across all time periods. ANOVA alone does not say which periods differ, so a post-hoc test (Tukey's HSD) is then used to compare the periods pairwise.

In [5]:
import pandas as pd
import numpy as np
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Seasonality check on pledge amounts: one-way ANOVA across launch months,
# followed (when significant) by Tukey's HSD to find which month pairs differ.
# Relies on `df`, the Kickstarter projects DataFrame built in the cells above.

# Feature extraction
# Re-coerce the timestamp columns so this cell does not depend on how the CSV
# was parsed; this is a no-op when they are already datetimes.
df['launched'] = pd.to_datetime(df['launched'])
df['deadline'] = pd.to_datetime(df['deadline'])
df['duration'] = (df['deadline'] - df['launched']).dt.days

# Launch month as an integer (1-12)
df['month'] = df['launched'].dt.month

# Mean pledge amount (USD) per launch month
grouped_data = df.groupby('month')['usd_pledged_real'].mean()

# One array of pledge amounts per launch month; each array is one ANOVA group
months = df['month'].unique()
groups = [df.loc[df['month'] == month, 'usd_pledged_real'].values for month in months]
statistic, p_value = f_oneway(*groups)

# Significance level shared by the ANOVA decision and the Tukey test
alpha = 0.05

if p_value < alpha:
    print("The mean pledge amounts for different months are significantly different.")
    print(f"ANOVA p-value: {p_value:.6f}")

    # Post-hoc pairwise comparison of months using Tukey's HSD
    tukey_results = pairwise_tukeyhsd(df['usd_pledged_real'], df['month'], alpha=alpha)
    summary_table = tukey_results.summary()

    # Keep the (group1, group2) month pairs whose difference is significant.
    # The last column of each summary row is the boolean `reject` flag, not a
    # p-value: comparing it against alpha (as `row[-1] < alpha`) would select
    # exactly the NON-significant pairs, so test the flag directly.
    # Row 0 of `.data` is the header row and is skipped.
    significant_pairs = [row[:2] for row in summary_table.data[1:] if row[-1]]

    # Months with the highest and lowest mean pledge amount
    max_month = grouped_data.idxmax()
    max_average_pledge = grouped_data.max()

    min_month = grouped_data.idxmin()
    min_average_pledge = grouped_data.min()

    print(f"The month with the highest average pledge amount: Month {max_month}, Mean Pledge Amount: {max_average_pledge:.2f}")
    print(f"The month with the lowest average pledge amount: Month {min_month}, Mean Pledge Amount: {min_average_pledge:.2f}")

else:
    print("The mean pledge amounts for different months are not significantly different.")
    print(f"ANOVA p-value: {p_value:.6f}")


The mean pledge amounts for different months are significantly different.
ANOVA p-value: 0.000000
The month with the highest average pledge amount: Month 9, Mean Pledge Amount: 10353.75
The month with the lowest average pledge amount: Month 12, Mean Pledge Amount: 6049.86
