<a href="https://colab.research.google.com/github/ekaterinagolowatenko/crowdfunding_baseline/blob/main/crowdfunding_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import io

import joblib
import numpy as np
import pandas as pd
from google.colab import files
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

In [None]:
# Ask the user for the Kickstarter CSV through Colab's upload widget.
# files.upload() returns a dict keyed by file name with the raw bytes as values.
DATASET_NAME = 'ks-projects-201801.csv'
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; expected " + DATASET_NAME)

# Read the bytes that were actually uploaded instead of a hard-coded path, so a
# renamed upload still loads and a cancelled upload cannot silently fall back to
# a stale file left on disk. Prefer the expected name when several files are sent.
csv_name = DATASET_NAME if DATASET_NAME in uploaded else next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[csv_name]))
df.columns.tolist()

Saving ks-projects-201801.csv to ks-projects-201801.csv


['ID',
 'name',
 'category',
 'main_category',
 'currency',
 'deadline',
 'goal',
 'launched',
 'pledged',
 'state',
 'backers',
 'country',
 'usd pledged',
 'usd_pledged_real',
 'usd_goal_real']

In [None]:
# Keep only rows whose state is 'successful' or 'failed' and derive the binary
# target: success = 1 for 'successful', 0 for 'failed'.
df = df[df['state'].isin(['successful', 'failed'])].copy()
df['success'] = (df['state'] == 'successful').astype(int)

# df_clean is the frame every later cell consumes. It starts from the already
# filtered/labelled df, so the filter and label are computed once.
df_clean = df.copy()
df_clean['launched'] = pd.to_datetime(df_clean['launched'])
df_clean['deadline'] = pd.to_datetime(df_clean['deadline'])
df_clean['duration_days'] = (df_clean['deadline'] - df_clean['launched']).dt.days

# Drop rows with a non-positive campaign length or a non-positive USD goal.
df_clean = df_clean[
    (df_clean['duration_days'] > 0) &
    (df_clean['usd_goal_real'] > 0)
].copy()
df_clean['name_length'] = df_clean['name'].fillna('').str.len()

# Collapse everything outside the 20 most frequent categories into 'Other'.
# This has to be applied to df_clean: the feature matrix is built from df_clean,
# so collapsing the column on df (as before) never reached the model.
top_categories = df_clean['category'].value_counts().head(20).index
df_clean['category'] = df_clean['category'].where(
    df_clean['category'].isin(top_categories), 'Other'
)
print("Уникальных категорий после сокращения:", df_clean['category'].nunique())

Уникальных категорий после сокращения: 21


In [None]:
# Columns fed to the model: two numeric inputs followed by four categorical ones.
features = ['usd_goal_real', 'duration_days', 'category', 'main_category', 'country', 'currency']

X = df_clean.loc[:, features]
y = df_clean['success']

# Hold out 20% of the rows for testing; stratify on the label so both parts keep
# the same success rate, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Sorted lists of the values present in the training split; the interactive
# predictor uses them to validate what the user types.
allowed_categories, allowed_main_categories, allowed_countries = (
    sorted(X_train[column].unique())
    for column in ('category', 'main_category', 'country')
)

# Sanity check on the split: sizes and the share of successful projects per part.
print("Train size:", X_train.shape[0])
print("Test size:", X_test.shape[0])
print("Success rate in train:", y_train.mean())
print("Success rate in test:", y_test.mean())

Train size: 265266
Test size: 66317
Success rate in train: 0.40388515678601855
Success rate in test: 0.4038783419032827


In [None]:
# Column groups: the first is one-hot encoded, the second is standardised.
categorical_features = ['category', 'main_category', 'country', 'currency']
numerical_features = ['usd_goal_real', 'duration_days']

# NOTE(review): with drop='first' together with handle_unknown='ignore',
# scikit-learn encodes an unseen category as all zeros, which appears to be the
# same encoding as the dropped first category - confirm this is acceptable.
preprocessor = ColumnTransformer([
    (
        'cat',
        OneHotEncoder(drop='first', handle_unknown='ignore'),
        categorical_features,
    ),
    (
        'num',
        StandardScaler(),
        numerical_features,
    ),
])

In [None]:
# Fit the encoder/scaler on the training split only, then reuse the fitted
# transformer for the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# The transformer may return a sparse matrix; convert to a dense array when it does.
if hasattr(X_train_processed, "toarray"):
    X_train_processed = X_train_processed.toarray()
if hasattr(X_test_processed, "toarray"):
    X_test_processed = X_test_processed.toarray()

# np.asarray accepts both a pandas Series and an ndarray, so this cell can be
# re-run safely; the previous `.values` access raised AttributeError on the
# second run because the names already held plain arrays.
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

In [None]:
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks and
# batch shuffling are repeatable between runs (the data split is already seeded).
# NOTE(review): some GPU kernels may still introduce small run-to-run differences.
set_random_seed(42)

# Feed-forward binary classifier: 128 -> 64 -> 32 hidden units with dropout after
# the first two hidden layers, and a sigmoid output giving P(success).
# The input width is declared with an explicit Input layer; passing input_shape
# to the first Dense layer makes Keras emit a warning.
model = Sequential([
    Input(shape=(X_train_processed.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# NOTE(review): the test split doubles as validation data here. That is fine for
# watching the curves, but do not tune epochs or architecture against it, or the
# metrics reported below stop being an unbiased estimate.
model.fit(
    X_train_processed, y_train,
    epochs=30,
    batch_size=64,
    validation_data=(X_test_processed, y_test),
    verbose=1
)

# Turn predicted probabilities into hard 0/1 labels at the 0.5 threshold.
y_pred_proba = model.predict(X_test_processed)
y_pred = (y_pred_proba > 0.5).astype(int).flatten()

print("F1-score:", f1_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m4145/4145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3ms/step - accuracy: 0.6474 - loss: 0.6245 - val_accuracy: 0.6667 - val_loss: 0.6071
Epoch 2/30
[1m4145/4145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - accuracy: 0.6657 - loss: 0.6062 - val_accuracy: 0.6684 - val_loss: 0.6034
Epoch 3/30
[1m4145/4145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.6697 - loss: 0.6026 - val_accuracy: 0.6702 - val_loss: 0.6005
Epoch 4/30
[1m4145/4145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - accuracy: 0.6683 - loss: 0.6015 - val_accuracy: 0.6691 - val_loss: 0.5992
Epoch 5/30
[1m4145/4145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.6706 - loss: 0.5992 - val_accuracy: 0.6693 - val_loss: 0.5989
Epoch 6/30
[1m4145/4145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - accuracy: 0.6685 - loss: 0.5988 - val_accuracy: 0.6725 - val_loss: 0.5960
Epoch 7/30
[

In [None]:
def predict_success(usd_goal, duration, category, main_category, country, currency):
    """Return a formatted success probability for one project.

    The arguments are placed in a one-row DataFrame under the same column names
    used for training, passed through the module-level ``preprocessor`` and
    scored with the module-level ``model``.

    Returns:
        A string of the form "Вероятность успеха: NN.N%".
    """
    row = {
        'usd_goal_real': usd_goal,
        'duration_days': duration,
        'category': category,
        'main_category': main_category,
        'country': country,
        'currency': currency,
    }
    encoded = preprocessor.transform(pd.DataFrame([row]))
    # Convert to a dense array when the transformer returns a sparse matrix.
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()
    # predict() returns one row with one column; take the scalar out of it.
    prob = model.predict(encoded)[0][0]
    return f"Вероятность успеха: {prob:.1%}"

In [None]:
def _ask_choice(prompt, options):
    """Prompt until the (whitespace-stripped) reply is one of ``options``.

    After an invalid reply the permitted values are printed, so the follow-up
    "choose from the list" prompt actually has a list to choose from.
    """
    reply = input(prompt).strip()
    while reply not in options:
        print("Допустимые значения:", ", ".join(map(str, options)))
        reply = input("Выберите из списка: ").strip()
    return reply


def _ask_positive(prompt, cast):
    """Prompt until the reply parses with ``cast`` and is a finite value above zero."""
    while True:
        try:
            value = cast(input(prompt))
        except ValueError:
            # Not a number at all: explain and ask again instead of crashing.
            print("Введите число.")
            continue
        # The chained comparison also rejects NaN and infinity for floats.
        if 0 < value < float('inf'):
            return value
        print("Значение должно быть больше нуля.")


def predict_success_interactive():
    """Collect project attributes from the console and return the prediction.

    Category, main category and country must be values present in the training
    split (``allowed_categories``, ``allowed_main_categories``,
    ``allowed_countries``). Goal and duration must be positive numbers, matching
    the filter applied to the training data.

    The currency is not asked for. It is taken as the currency most often paired
    with the chosen country in ``X_train``, rather than always 'USD', so the
    model is not handed a country/currency pairing that is rare or absent in
    training. 'USD' is kept as a fallback.

    Returns:
        The formatted string produced by ``predict_success``.
    """
    category = _ask_choice("Категория: ", allowed_categories)
    main_category = _ask_choice("Основная категория: ", allowed_main_categories)
    country = _ask_choice("Страна: ", allowed_countries)
    usd_goal = _ask_positive("Цель в USD: ", float)
    duration = _ask_positive("Длительность (дней): ", int)

    country_currencies = X_train.loc[X_train['country'] == country, 'currency'].mode()
    currency = 'USD' if country_currencies.empty else country_currencies.iloc[0]

    # Delegate to the non-interactive helper so the scoring logic lives in one place.
    return predict_success(usd_goal, duration, category, main_category, country, currency)

In [None]:
# Run the console questionnaire once and print the formatted probability it returns.
print(predict_success_interactive())

Категория: Technology
Основная категория: Tech
Выберите из списка: Art
Страна: US
Цель в USD: 10000
Длительность (дней): 40
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Вероятность успеха: 33.5%
