In [1]:
import pandas as pd
import warnings 
warnings.filterwarnings(action='ignore')
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline

## 코사인 유사도 측정 (Cosine similarity measurement)

In [2]:
# Load the corpus and cast the text column to strings in one chained expression.
df = pd.read_csv('open/word.csv').astype({'text': 'str'})
df

Unnamed: 0,text,author
0,he was almost choking there was so much so muc...,3
1,your sister asked for it i suppose,2
2,she was engaged one day as she walked in perus...,1
3,the captain was in the porch keeping himself c...,4
4,have mercy gentlemen odin flung up his hands ...,3
...,...,...
54874,is that you mr smith odin whispered i hardly ...,2
54875,i told my plan to the captain and between us w...,4
54876,your sincere well wisher friend and sister luc...,1
54877,then you wanted me to lend you money,3


In [3]:
# Collect the documents as a plain Python list for the vectorizer.
# Series.tolist() already returns a new list, so the extra list() copy was redundant.
sent = df.text.tolist()
# Preview only the first few documents; echoing the whole corpus floods the notebook output.
sent[:5]

['he was almost choking there was so much so much he wanted to say but strange exclamations were all that came from his lips the pole gazed fixedly at him at the bundle of notes in his hand looked at odin and was in evident perplexity',
 'your sister asked for it i suppose',
 'she was engaged one day as she walked in perusing jane s last letter and dwelling on some passages which proved that jane had not written in spirits when instead of being again surprised by mr odin she saw on looking up that odin was meeting her putting away the letter immediately and forcing a smile she said',
 'the captain was in the porch keeping himself carefully out of the way of a treacherous shot should any be intended he turned and spoke to us doctors watch on the lookout dr odin take the north side if you please jim the east gray west the watch below all hands to load muskets lively men and careful',
 'have mercy gentlemen odin flung up his hands  don t write that anyway have some shame here i ve torn my

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on every document in `sent`, dropping scikit-learn's built-in English stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(sent)

# Learned inverse-document-frequency weight, one entry per vocabulary term.
idf = tfidf_vectorizer.idf_

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, while
# get_feature_names_out() does not exist before 1.0. vocabulary_ (term -> column index)
# is available in both old and new releases, so rebuild the column-ordered term list from it.
vocabulary = tfidf_vectorizer.vocabulary_
feature_names = sorted(vocabulary, key=vocabulary.get)
idf_by_term = dict(zip(feature_names, idf))

# Print a small sample instead of the whole vocabulary to keep the cell output readable.
print(dict(list(idf_by_term.items())[:10]))



In [5]:
# Report the dimensions of the TF-IDF matrix.
matrix_shape = tfidf_matrix.shape
print(matrix_shape)

(54879, 33870)


In [6]:
# Cosine similarity

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of documents. With a single argument
# scikit-learn compares the matrix against itself, which is what the two-argument call did.
# NOTE(review): the result is a dense N x N array; with the 54879 rows reported for
# tfidf_matrix that is roughly 24 GB if the values are float64, while the following cells
# only use the first 5000 rows. Consider computing just the rows that are needed.
novel_sim = cosine_similarity(tfidf_matrix)
novel_sim

array([[1.        , 0.        , 0.00735499, ..., 0.00833741, 0.10811559,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.2187648 , 0.        ,
        0.        ],
       [0.00735499, 0.        , 1.        , ..., 0.01331629, 0.        ,
        0.01697013],
       ...,
       [0.00833741, 0.2187648 , 0.01331629, ..., 1.        , 0.        ,
        0.        ],
       [0.10811559, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.01697013, ..., 0.        , 0.        ,
        1.        ]])

In [8]:
# Keep the similarity rows of the first 5000 documents (all columns) as the feature matrix.
novel_x = novel_sim[:5000, :]
novel_x.shape

(5000, 54879)

In [9]:
# Author labels for the same first 5000 rows, selected by position.
author_x = df['author'].iloc[:5000]
author_x.shape

(5000,)

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sampled rows for evaluation. Stratifying on the author labels keeps
# the class proportions alike in both splits, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=13, stratify=author_x)
X_train, X_test, y_train, y_test = train_test_split(novel_x, author_x, **split_options)

In [11]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k = 5, trained on the training split.
# fit() returns the estimator itself, so the chained call leaves `knn` fitted.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [12]:
from sklearn.metrics import accuracy_score

# Predict on both splits, then print the training accuracy followed by the test accuracy.
train_pred, test_pred = knn.predict(X_train), knn.predict(X_test)

for y_true, y_hat in ((y_train, train_pred), (y_test, test_pred)):
    print(accuracy_score(y_true, y_hat))

0.5675
0.341


In [15]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision / recall / F1 for the training split, then the test split,
# separated by a dashed rule.
train_report = classification_report(y_train, train_pred)
test_report = classification_report(y_test, test_pred)
divider = '---' * 15

print(train_report, divider, test_report, sep='\n')

              precision    recall  f1-score   support

           0       0.65      0.66      0.65      1009
           1       0.74      0.39      0.51       497
           2       0.41      0.79      0.54       814
           3       0.75      0.51      0.61      1102
           4       0.51      0.37      0.43       578

    accuracy                           0.57      4000
   macro avg       0.61      0.54      0.55      4000
weighted avg       0.62      0.57      0.57      4000

---------------------------------------------
              precision    recall  f1-score   support

           0       0.40      0.37      0.38       252
           1       0.44      0.15      0.22       124
           2       0.27      0.65      0.38       204
           3       0.51      0.26      0.34       275
           4       0.26      0.18      0.21       145

    accuracy                           0.34      1000
   macro avg       0.38      0.32      0.31      1000
weighted avg       0.39      0.