# Court Decision Prediction Competition - Dacon

## David Euijoon Kim

---

### Library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re

# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.options.display.max_columns = None

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import *
from sklearn.model_selection import train_test_split

---

### Load Data

In [3]:
# Read data/train.csv (path is relative to the current working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/train.csv')
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1


In [4]:
# Print column names, non-null counts, and dtypes to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2478 entries, 0 to 2477
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ID                  2478 non-null   object
 1   first_party         2478 non-null   object
 2   second_party        2478 non-null   object
 3   facts               2478 non-null   object
 4   first_party_winner  2478 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 96.9+ KB


---

### Split

In [5]:
# Feature: the free-text `facts` column; target: the `first_party_winner` label.
X, y = df['facts'], df['first_party_winner']

In [6]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

---

### Vectorization

In [7]:
# Fit the TF-IDF vocabulary on the training texts only, then apply the
# same fitted vectorizer to the held-out texts.
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# One shape per line: training matrix first, held-out matrix second.
print(X_train_vect.shape, X_test_vect.shape, sep='\n')

(1734, 14812)
(744, 14812)


---

### Naive Bayes

In [12]:
from sklearn.naive_bayes import MultinomialNB # The Naive Bayes algo

In [8]:
# Multinomial Naive Bayes with default hyperparameters, fitted on the TF-IDF training matrix.
nb = MultinomialNB()
nb.fit(X=X_train_vect, y=y_train)

In [9]:
# Per-class sample counts seen during fitting (shows how balanced the training labels are).
nb.class_count_

array([ 568., 1166.])

In [10]:
# Predict on the held-out split and report plain accuracy for the Naive Bayes model.
y_pred = nb.predict(X=X_test_vect)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6491935483870968

---

### Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

In [15]:
# Unpenalized logistic regression.
# The string form penalty="none" was deprecated in scikit-learn 1.2 and removed
# in 1.4, so it raises on current releases. C=np.inf (inverse regularization
# strength set to infinity) yields the same unregularized objective and is
# accepted by both older and newer versions.
lr = LogisticRegression(C=np.inf, solver='lbfgs')
lr.fit(X_train_vect, y_train)



In [16]:
# Predict on the held-out split and report plain accuracy for the logistic regression model.
y_pred = lr.predict(X=X_test_vect)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.5887096774193549

---

### Neural Network

In [17]:
from sklearn.neural_network import MLPClassifier

In [54]:
# Small baseline network: two hidden layers of two units each, sigmoid
# activations, L-BFGS optimizer, fixed seed for repeatable weight initialization.
nn = MLPClassifier(
    hidden_layer_sizes=(2, 2),
    activation='logistic',
    solver='lbfgs',
    random_state=42,
)
nn.fit(X_train_vect, y_train)

In [55]:
# Predict on the held-out split and report plain accuracy for the neural network.
# NOTE(review): the recorded score below is digit-for-digit identical to the
# Naive Bayes score; both models may be predicting a single class for every
# row - confirm with a confusion matrix before comparing models.
y_pred = nn.predict(X=X_test_vect)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6491935483870968

#### GridSearch

In [57]:
from sklearn.model_selection import GridSearchCV

# Hyperparameters that apply to every solver.
common_grid = {'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
               'activation': ['tanh', 'relu', 'logistic', 'identity'],
               'alpha': [0.0001, 0.05],
               }

# `learning_rate` is only used by the 'sgd' solver, so it is searched only
# there; crossing it with 'adam' / 'lbfgs' would triple those fits without
# changing any score.
parameters = [
    {**common_grid,
     'solver': ['sgd'],
     'learning_rate': ['constant', 'adaptive', 'invscaling']},
    {**common_grid,
     'solver': ['adam', 'lbfgs']},
]

# Fixed seed so weight initialization (and therefore the selected
# configuration) is reproducible across runs.
mlp = MLPClassifier(max_iter=1000, random_state=42)

# 5-fold cross-validated search over the training split, using all CPU cores.
clf = GridSearchCV(mlp, parameters, n_jobs=-1, cv=5)

clf.fit(X_train_vect, y_train)


print('Best accuracy: ', clf.best_score_)
print('Parameter values to achieve best accuracy: ', clf.best_params_)



Best accuracy:  0.6730106111842215
Parameter values to achieve best accuracy:  {'activation': 'identity', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'invscaling', 'solver': 'sgd'}


In [None]:
# Best parameters found by the grid search above (recorded for reference):
# {'activation': 'identity', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'invscaling', 'solver': 'sgd'}