# 라이브러리 불러오기

In [1]:
import numpy as np
import pandas as pd
from scipy import sparse

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# 데이터 불러오기

In [10]:
# Read the training set, the test set and the sample submission template
# from the local ./data directory, in that order.
train_df, test_df, submission_df = map(
    pd.read_csv,
    ('./data/train.csv', './data/test.csv', './data/sample_submission.csv'),
)

# Preview the first rows of both feature tables.
display(train_df.head(), test_df.head())

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1


Unnamed: 0,ID,first_party,second_party,facts
0,TEST_0000,Salerno,United States,The 1984 Bail Reform Act allowed the federal c...
1,TEST_0001,Milberg Weiss Bershad Hynes and Lerach,"Lexecon, Inc.",Lexecon Inc. was a defendant in a class action...
2,TEST_0002,No. 07-582\t Title: \t Federal Communications ...,"Fox Television Stations, Inc., et al.","In 2002 and 2003, Fox Television Stations broa..."
3,TEST_0003,Harold Kaufman,United States,During his trial for armed robbery of a federa...
4,TEST_0004,Berger,Hanlon,"In 1993, a magistrate judge issued a warrant a..."


## 데이터 크기 확인

In [11]:
# Report the (rows, columns) shape of the train and test frames on one line.
shape_msg = 'Train Shape : {}, Test Shape : {}'
print(shape_msg.format(train_df.shape, test_df.shape))

Train Shape : (2478, 5), Test Shape : (1240, 4)


# 데이터 전처리

## 데이터 나누기

In [12]:
# Text columns used as model inputs.
feature_cols = ['first_party', 'second_party', 'facts']

# Training inputs and the label column to predict.
X_train = train_df.loc[:, feature_cols]
y_train = train_df.loc[:, 'first_party_winner']

# The test frame has no label; only the ID column is removed.
X_test = test_df.drop(columns=['ID'])

display(X_train, y_train, X_test)

Unnamed: 0,first_party,second_party,facts
0,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ..."
1,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...
2,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...
3,Linkletter,Walker,Victor Linkletter was convicted in state court...
4,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud..."
...,...,...,...
2473,"HollyFrontier Cheyenne Refining, LLC, et al.","Renewable Fuels Association, et al.",Congress amended the Clean Air Act through the...
2474,"Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc.","Alliance Bond Fund, Inc., an investment fund, ..."
2475,Peguero,United States,"In 1992, the District Court sentenced Manuel D..."
2476,Immigration and Naturalization Service,St. Cyr,"On March 8, 1996, Enrico St. Cyr, a lawful per..."


0       1
1       0
2       1
3       0
4       1
       ..
2473    1
2474    1
2475    0
2476    0
2477    0
Name: first_party_winner, Length: 2478, dtype: int64

Unnamed: 0,first_party,second_party,facts
0,Salerno,United States,The 1984 Bail Reform Act allowed the federal c...
1,Milberg Weiss Bershad Hynes and Lerach,"Lexecon, Inc.",Lexecon Inc. was a defendant in a class action...
2,No. 07-582\t Title: \t Federal Communications ...,"Fox Television Stations, Inc., et al.","In 2002 and 2003, Fox Television Stations broa..."
3,Harold Kaufman,United States,During his trial for armed robbery of a federa...
4,Berger,Hanlon,"In 1993, a magistrate judge issued a warrant a..."
...,...,...,...
1235,"Haitian Centers Council, Inc., et al.","Chris Sale, Acting Commissioner, Immigration A...",According to Executive Order No. 12807 signed ...
1236,Whitman,"American Trucking Associations, Inc.",Section 109(a) of the Clean Air Act (CAA) requ...
1237,Linda A. Matteo and John J. Madigan,William G. Barr,Linda Matteo and John Madigan created a plan f...
1238,Washington State Apple Advertising Commission,Hunt,"In 1972, the North Carolina Board of Agricultu..."


## TF-IDF 기반으로 벡터화

In [13]:
# TF-IDF vectorizer created with default settings; the same instance is passed
# to vect() for both the training frame and the test frame.
tfidf_vect = TfidfVectorizer()

def vect(vect_model, df, is_train):
    """Vectorize the three text columns of ``df`` into one dense feature matrix.

    Parameters
    ----------
    vect_model : object
        Vectorizer exposing ``fit_transform`` and ``transform`` that return
        sparse matrices (the notebook passes a ``TfidfVectorizer``).
    df : pandas.DataFrame
        Frame containing the columns ``'facts'``, ``'first_party'`` and
        ``'second_party'``.
    is_train : bool
        When true, ``vect_model`` is fitted on ``df['facts']`` before that
        column is transformed; otherwise the model is used as-is.

    Returns
    -------
    numpy.ndarray
        Dense 2-D array whose columns are the ``first_party`` features,
        followed by the ``second_party`` features, followed by the ``facts``
        features.
    """
    # The model is only ever fitted on 'facts'; both party columns are encoded
    # with that same fitted model, so all three blocks have the same width.
    if is_train:
        X_facts = vect_model.fit_transform(df['facts'])
    else:
        X_facts = vect_model.transform(df['facts'])
    X_first = vect_model.transform(df['first_party'])
    X_second = vect_model.transform(df['second_party'])

    # Stack while still sparse and densify once: converting each block to a
    # dense array first and then concatenating holds every block twice in
    # memory at the peak.
    stacked = sparse.hstack([X_first, X_second, X_facts], format='csr')
    return stacked.toarray()

In [14]:
# Vectorize straight from the loaded frames instead of from X_train / X_test.
# vect() only reads the 'facts', 'first_party' and 'second_party' columns, so
# the result is the same, but the cell no longer consumes the names it
# overwrites and can therefore be re-run without failing on an ndarray input.
X_train = vect(tfidf_vect, train_df, True)    # fits the vectorizer on the training facts
X_test = vect(tfidf_vect, test_df, False)     # reuses the fitted vectorizer

display(X_train)
display(X_test)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# 모델링

In [15]:
# Hold out 30% of the vectorized training rows for validation, with a fixed seed.
# NOTE: X_train / y_train are rebound to the remaining 70%, so re-running this
# cell without first re-running the cells that build X_train and y_train would
# split the already-reduced data again.
split_parts = train_test_split(X_train, y_train, test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val = split_parts

In [16]:
# Fit a logistic regression with default settings on the training split.
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Score the fitted model on the hold-out split so its quality is visible
# before a submission is produced; the split was otherwise never used.
print(classification_report(y_val, lr.predict(X_val)))

# Predict the label for every test row.
pred = lr.predict(X_test)
print(pred)

[0 1 1 ... 1 1 1]


# 제출

In [17]:
# Put the predictions into the submission template's target column and write
# the result to CSV without the index column.
submission_df = submission_df.assign(first_party_winner=pred)
submission_df.to_csv(path_or_buf='basic_logi.csv', index=False)