### Downsampling & deeplearning 적용하기

#### Module import & Load Data

In [46]:
# 메모리내 변수 제거

all = [var for var in globals() if var[0] != "_"]   # globals() 목록의 첫글자가 _ 로 시작하지 않는 자료의 리스트만 가져와서
for var in all:
    del globals()[var]

In [47]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# import wordcloud
import nltk
# import seaborn as sns
import re

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.utils import resample

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import plot_model

In [48]:
train1 = pd.read_csv('c:/data/project/train.csv')
test = pd.read_csv('c:/data/project/test.csv')

In [49]:
train_nlp = train1.copy()
train_nlp.info()
train_nlp.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2478 entries, 0 to 2477
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ID                  2478 non-null   object
 1   first_party         2478 non-null   object
 2   second_party        2478 non-null   object
 3   facts               2478 non-null   object
 4   first_party_winner  2478 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 96.9+ KB


Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1


In [50]:
train_nlp['facts'] = train_nlp['first_party']+', '+train_nlp['second_party']+', '+train_nlp['facts']
train_nlp = train_nlp[['facts','first_party_winner']]
train_nlp

Unnamed: 0,facts,first_party_winner
0,"Phil A. St. Amant, Herman A. Thompson, On June...",1
1,"Stephen Duncan, Lawrence Owens, Ramon Nelson w...",0
2,"Billy Joe Magwood, Tony Patterson, Warden, et ...",1
3,"Linkletter, Walker, Victor Linkletter was conv...",0
4,"William Earl Fikes, Alabama, On April 24, 1953...",1
...,...,...
2473,"HollyFrontier Cheyenne Refining, LLC, et al., ...",1
2474,"Grupo Mexicano de Desarrollo, S. A., Alliance ...",1
2475,"Peguero, United States, In 1992, the District ...",0
2476,"Immigration and Naturalization Service, St. Cy...",0


#### DownSampling

In [51]:
subset_0 = train_nlp[train_nlp["first_party_winner"] == 0]
subset_1 = train_nlp[train_nlp["first_party_winner"] == 1]

subset_1_downsampled = resample(subset_1,
                                replace=False,
                                n_samples=829,
                                random_state=42)

train = pd.concat([subset_0, subset_1_downsampled])
train

Unnamed: 0,facts,first_party_winner
1,"Stephen Duncan, Lawrence Owens, Ramon Nelson w...",0
3,"Linkletter, Walker, Victor Linkletter was conv...",0
14,"James J. Thole, et al., U.S. Bank, N.A., et al...",0
16,"Plyler, Doe, A revision to the Texas education...",0
21,"Bassam Yacoub Salman, United States, Maher Kar...",0
...,...,...
788,"United States, Arnold Schwinn & Co., Schwinn C...",1
350,"Vaughan, Atkinson, The general maritime law of...",1
1628,"Florida, Joe Elton Nixon, A Florida court conv...",1
1820,"B. C. Foreman et al., Dallas County, Texas et ...",1


#### Stopwords

In [52]:
##### 불용어 처리 및 어간과 표제어 추출 함수 #####

def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
## clean (convert to lowercase and remove punctuations and characters and then strip)

    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    # text = re.sub(r'[^\w\s]', '', str(text).strip())

    # 정규식, 치환 문자열, src 문자열 입력 -> 해당 문자가 발견되지 않는 경우 space bar로 변환, 단어별 split을 위한 작업

    ## Tokenize (convert from string to list)
    lst_text = text.split()    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in
                    lst_stopwords]

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]

    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    ## back to string from list
    text = " ".join(lst_text)
    return text

In [53]:
lst_stopwords = nltk.corpus.stopwords.words("english")
# NLTK가 정의한 영어 불용어 리스트를 리턴

train["facts"] = train["facts"].apply(lambda x: utils_preprocess_text(x, flg_stemm=False, flg_lemm=True, lst_stopwords=lst_stopwords))
# nltk 기본 불용어를 제거한 결과
train

Unnamed: 0,facts,first_party_winner
1,stephen duncan lawrence owen ramon nelson ridi...,0
3,linkletter walker victor linkletter convicted ...,0
14,james j thole et al u bank na et al named plai...,0
16,plyler doe revision texas education law 1975 a...,0
21,bassam yacoub salman united state maher kara j...,0
...,...,...
788,united state arnold schwinn co schwinn cycle d...,1
350,vaughan atkinson general maritime law united s...,1
1628,florida joe elton nixon florida court convicte...,1
1820,b c foreman et al dallas county texas et al 19...,1


#### Apply TfidfVectorizer

In [54]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train['facts'])
y=train['first_party_winner']

In [55]:
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.3, random_state=42)

In [56]:
X_train

<1160x15010 sparse matrix of type '<class 'numpy.float64'>'
	with 85814 stored elements in Compressed Sparse Row format>

In [57]:
# print(X_train.shape)
X_train_dvs = pd.DataFrame(X_train)
X_train_dvs

# X_train_dvs=np.asarray(X_train)
# X_train_dvs

Unnamed: 0,0
0,"(0, 10474)\t0.1115998687596908\n (0, 14696)..."
1,"(0, 11929)\t0.11179863571710291\n (0, 2881)..."
2,"(0, 9468)\t0.12444728242513084\n (0, 11077)..."
3,"(0, 5921)\t0.10932260208195271\n (0, 12077)..."
4,"(0, 4557)\t0.11001011899737416\n (0, 3701)\..."
...,...
1155,"(0, 8304)\t0.12047528554752111\n (0, 3722)\..."
1156,"(0, 9173)\t0.09619653222961917\n (0, 967)\t..."
1157,"(0, 1194)\t0.11431800551817459\n (0, 11991)..."
1158,"(0, 11867)\t0.07517371274162232\n (0, 660)\..."


## Define Model & Train

In [58]:
# define Dense layer model
def simple_model():
  model =  keras.Sequential([
      layers.Dropout(0.3),
      layers.BatchNormalization(),      
      layers.Dense(256, activation="relu", kernel_regularizer="l1"),
      layers.Dropout(0.3),
      layers.BatchNormalization(),
      layers.Dense(512, activation="relu", kernel_regularizer="l1"),
      layers.Dropout(0.4),
      layers.BatchNormalization(),
      layers.Dense(256, activation="relu"),
      layers.Dropout(0.4),
      layers.BatchNormalization(),
      layers.Dense(64, activation="relu"),
      layers.Dropout(0.4),
      layers.BatchNormalization(),      
      layers.Dense(1, activation="sigmoid"),     
    ])
  model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

  return model

In [59]:
# train and cross validate the model

k = 4
print(len(X_train_dvs))
num_validation_samples = len(X_train_dvs) // k
X_train[:num_validation_samples * 1].shape, X_train.shape

1160


((290, 15010), (1160, 15010))

In [60]:
num_epochs = 30
batch_sizes = 50
all_loss_histories = []
all_val_loss_histories = []  
all_acc_histories = []
all_val_acc_histories = []
X_train_test = X_train.toarray()
# Y_train_test = 

# For each validation fold, we will train a full set of epochs, and store the history. 
for fold in range(1,k):
    validation_data = X_train_test[num_validation_samples * fold:
                        num_validation_samples * (fold + 1)]
    validation_targets = y_train[num_validation_samples * fold:
                        num_validation_samples * (fold + 1)]
    training_data = np.concatenate([
        X_train_test[:num_validation_samples *fold],
        X_train_test[num_validation_samples * (fold + 1):]])
    training_targets = np.concatenate([
        y_train[:num_validation_samples * fold],
        y_train[num_validation_samples * (fold + 1):]])
    # training_data = training_data.todense()
    # training_targets = training_targets.toarray()
    model = simple_model()

    history = model.fit(training_data, training_targets,  
                        validation_data = (validation_data,validation_targets), 
                        epochs=num_epochs, 
                        batch_size=batch_sizes
                        )

#training_data


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 2

In [61]:
# keras.utils.plot_model(model,show_shapes=True)

In [62]:
# from sklearn.metrics import roc_curve
# y_pred = model.predict(X_eval.toarray()).ravel()
# fpr_1, tpr_1, thresholds_1 = roc_curve(y_eval, y_pred)

# from sklearn.metrics import auc
# auc_1 = auc(fpr_1, tpr_1)

# plt.figure(1)
# plt.plot([0, 1], [0, 1], 'k--')
# plt.plot(fpr_1, tpr_1, label='Dense layer model (area = {:.3f})'.format(auc_1))
# plt.xlabel('False positive rate')
# plt.ylabel('True positive rate')
# plt.title('ROC curve')
# plt.legend(loc='best')
# plt.show()

In [63]:
# evaluate the model by using test data
loss_metrics = model.evaluate(X_eval.toarray(),y_eval,verbose=1)



### Test csv 적용

In [64]:
test.head()

Unnamed: 0,ID,first_party,second_party,facts
0,TEST_0000,Salerno,United States,The 1984 Bail Reform Act allowed the federal c...
1,TEST_0001,Milberg Weiss Bershad Hynes and Lerach,"Lexecon, Inc.",Lexecon Inc. was a defendant in a class action...
2,TEST_0002,No. 07-582\t Title: \t Federal Communications ...,"Fox Television Stations, Inc., et al.","In 2002 and 2003, Fox Television Stations broa..."
3,TEST_0003,Harold Kaufman,United States,During his trial for armed robbery of a federa...
4,TEST_0004,Berger,Hanlon,"In 1993, a magistrate judge issued a warrant a..."


In [65]:
test['facts'] = test['first_party']+', '+test['second_party']+', '+test['facts']
dfTest=test['facts']
dfTest

0       Salerno, United States, The 1984 Bail Reform A...
1       Milberg Weiss Bershad Hynes and Lerach, Lexeco...
2       No. 07-582\t Title: \t Federal Communications ...
3       Harold Kaufman , United States, During his tri...
4       Berger, Hanlon, In 1993, a magistrate judge is...
                              ...                        
1235    Haitian Centers Council, Inc., et al., Chris S...
1236    Whitman, American Trucking Associations, Inc.,...
1237    Linda A. Matteo and John J. Madigan, William G...
1238    Washington State Apple Advertising Commission,...
1239    Theodore Stovall, Wilfred Denno, Warden, On Au...
Name: facts, Length: 1240, dtype: object

In [66]:
dfTest = pd.DataFrame(dfTest)
dfTest["facts"] = dfTest["facts"].apply(lambda x: utils_preprocess_text(x, flg_stemm=False, flg_lemm=True, lst_stopwords=lst_stopwords))
dfTest['first_party_winner'] = np.zeros(len(dfTest)).astype(int)
dfTest

Unnamed: 0,facts,first_party_winner
0,salerno united state 1984 bail reform act allo...,0
1,milberg wei bershad hynes lerach lexecon inc l...,0
2,07582 title federal communication commission e...,0
3,harold kaufman united state trial armed robber...,0
4,berger hanlon 1993 magistrate judge issued war...,0
...,...,...
1235,haitian center council inc et al chris sale ac...,0
1236,whitman american trucking association inc sect...,0
1237,linda matteo john j madigan william g barr lin...,0
1238,washington state apple advertising commission ...,0


In [67]:
vectorizer = TfidfVectorizer()
X_test = vectorizer.fit_transform(dfTest['facts'])
y_test=dfTest['first_party_winner']
X_test.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [68]:
y_pred_test = model.predict(X_test.toarray())
predcsv = pd.DataFrame(y_pred_test,columns=['first_party_winner'])
predcsv.value_counts()

ValueError: in user code:

    File "c:\Users\user\miniconda3\envs\gpu\lib\site-packages\keras\engine\training.py", line 1801, in predict_function  *
        return step_function(self, iterator)
    File "c:\Users\user\miniconda3\envs\gpu\lib\site-packages\keras\engine\training.py", line 1790, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\user\miniconda3\envs\gpu\lib\site-packages\keras\engine\training.py", line 1783, in run_step  **
        outputs = model.predict_step(data)
    File "c:\Users\user\miniconda3\envs\gpu\lib\site-packages\keras\engine\training.py", line 1751, in predict_step
        return self(x, training=False)
    File "c:\Users\user\miniconda3\envs\gpu\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\user\miniconda3\envs\gpu\lib\site-packages\keras\engine\input_spec.py", line 248, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer "sequential_9" (type Sequential).
    
    Input 0 of layer "batch_normalization_45" is incompatible with the layer: expected axis 1 of input shape to have value 15010, but received input with shape (None, 9039)
    
    Call arguments received:
      • inputs=tf.Tensor(shape=(None, 9039), dtype=float32)
      • training=False
      • mask=None


## Inference & Submission

In [None]:
submit = pd.read_csv('C:/data/project/sample_submission.csv')
submit['first_party_winner'] = predcsv
submit.to_csv('./sample_submission.csv', index=False)
print('Done')

Done
