In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import LinearSVC

In [2]:
df = pd.read_csv("ks-projects-201801.csv", index_col="ID")

In [3]:
df.head()

Unnamed: 0_level_0,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


# 前処理

In [4]:
df = df.drop(columns=['pledged', 'backers', 'usd pledged', 'usd_pledged_real', 'goal'])

In [5]:
# 日時に関する変数を TimeStamp に変換
df['deadline'] = pd.to_datetime(df['deadline'])
df['launched'] = pd.to_datetime(df['launched'])
# 期間の変数を作成
df['period'] = (df['deadline'] - df['launched']).dt.days

In [6]:
# 異常値の削除
df = df[df['period'] < 10000] # 異常値の除去
# 欠損値の削除
df = df.dropna(axis=0)

df = df[(df['state'] == 'successful') | (df['state'] == 'failed')]

In [7]:
df['n_words'] = df['name'].apply(lambda x: len(str(x).split(' ')))

In [8]:
df = df.drop(columns=['name', 'deadline', 'launched', 'country','category'])

In [9]:
X = df.drop(["state"], axis=1)
df["state"] = [1 if s == "successful" else 0 for s in df["state"]]
y = df["state"]

In [10]:
X = pd.get_dummies(X, drop_first=True)

## usd_goal_realを標準化

In [11]:
stdsc = StandardScaler()
df["usd_goal_real"] = stdsc.fit_transform(df[["usd_goal_real"]].values)

stdsc = StandardScaler()
df["period"] = stdsc.fit_transform(df[["period"]].values)

stdsc = StandardScaler()
df["n_words"] = stdsc.fit_transform(df[["n_words"]].values)


In [12]:
df

Unnamed: 0_level_0,main_category,currency,state,usd_goal_real,period,n_words
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1000002330,Publishing,GBP,0,-0.036049,1.969999,0.123984
1000003930,Film & Video,USD,0,-0.010379,2.048657,0.842204
1000004038,Film & Video,USD,0,0.003147,0.868787,-0.953345
1000007540,Music,USD,0,-0.032923,-0.311084,0.483094
1000014025,Food,USD,1,0.007656,0.082206,-0.953345
...,...,...,...,...,...,...
999975836,Food,USD,0,-0.031571,-0.311084,0.123984
999977640,Film & Video,USD,0,-0.036079,-0.547058,-1.312455
999986353,Film & Video,USD,0,-0.023906,0.947445,1.919533
999987933,Technology,USD,0,-0.023906,-0.232426,-0.953345


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

parameters = {'C':[1, 5, 10]}
model = LinearSVC()
clf = GridSearchCV(model, parameters, cv=3)
clf.fit(X_train, y_train)
print(clf.best_params_, clf.best_score_)



{'C': 1} 0.6122797949778179




In [18]:
# 最適パラメータを用いて識別する
clf2 = LinearSVC(**clf.best_params_)
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)



In [20]:
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f_1 = f1_score(y_test, y_pred)

print(f'正解率: {acc:.3}')
print(f'Precision: {precision:.3}')
print(f'Recall: {recall:.3}')
print(f'F1: {f_1:.3}')

正解率: 0.611
Precision: 0.676
Recall: 0.0743
F1: 0.134


### Day1の結果

正答率（Accuracy） = 49.573%  
適合率（Precision） = 40.058%  
再現率（Recall） = 85.701%  
F1値（F1-score） = 54.596%  