In [19]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from nltk import word_tokenize

import os
import random

# South Park

In [3]:
df = pd.read_csv('C:/Users/S451/Desktop/nlp/13.03_ml/SouthParkData-master/All-seasons.csv')

In [4]:
df.groupby('Character').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Episode,Line,Season
Character,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A Banana,count,1,1,1
A Banana,unique,1,1,1
A Banana,top,12,"Wow, Gangnamstein. I wish I'd have thought of ...",16
A Banana,freq,1,1,1
A Bishop,count,1,1,1
A Bishop,unique,1,1,1
A Bishop,top,11,"Uh, yes they are.\n",10
A Bishop,freq,1,1,1
A Boy,count,13,13,13
A Boy,unique,8,13,9


In [5]:
Cartman_lines = df[df['Character'] == 'Cartman']
Stan_lines = df[df['Character'] == 'Stan']
Kyle_lines = df[df['Character'] == 'Kyle']
Kenny_lines = df[df['Character'] == 'Kenny']
print(Cartman_lines.describe(), Stan_lines.describe(), Kyle_lines.describe(), Kenny_lines.describe())

       Season Episode Character     Line
count    9774    9774      9774     9774
unique     18      18         1     9340
top         4       7   Cartman  What?\n
freq      801     850      9774       52        Season Episode Character     Line
count    7680    7680      7680     7680
unique     18      18         1     6995
top         2      10      Stan  What?\n
freq      831     594      7680       73        Season Episode Character     Line
count    7099    7099      7099     7099
unique     18      18         1     6493
top         2       1      Kyle  What?\n
freq      824     635      7099       61        Season Episode Character       Line
count     881     881       881        881
unique     18      18         1        754
top         3       3     Kenny  (Yeah!)\n
freq      132     106       881         17


Реплик Кенни значительно меньше, а Картмана - больше, поэтому уравниваем данные

In [6]:
Cartman_lines = Cartman_lines[:800]
Stan_lines = Stan_lines[:800]
Kyle_lines = Kyle_lines[:800]
Kenny_lines = Kenny_lines[:800]
main_characters = pd.concat([Cartman_lines, Stan_lines, Kyle_lines, Kenny_lines])

In [7]:
main_characters.groupby('Character').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Episode,Line,Season
Character,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Cartman,count,800,800,800
Cartman,unique,13,789,2
Cartman,top,4,What?\n,10
Cartman,freq,130,4,574
Kenny,count,800,800,800
Kenny,unique,18,695,16
Kenny,top,1,(Yeah!)\n,3
Kenny,freq,93,15,132
Kyle,count,800,800,800
Kyle,unique,14,770,3


In [8]:
X_train, X_test, y_train, y_test = train_test_split(main_characters['Line'], main_characters['Character'], test_size=0.2)

Я не буду считать обсценную лексику как стоп-слова, так как она важна, и некоторые слова могут быть отличительными для некоторых персонажей

In [9]:
cv = CountVectorizer(stop_words="english")
X_trained = cv.fit_transform(X_train)
X_tested = cv.transform(X_test)

In [10]:
dc = DummyClassifier()
dc.fit(X_trained, y_train)
pred = dc.predict(X_tested)
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

    Cartman       0.25      0.26      0.26       144
      Kenny       0.28      0.27      0.27       161
       Kyle       0.30      0.28      0.29       181
       Stan       0.21      0.21      0.21       154

avg / total       0.26      0.26      0.26       640



Равновероятный выбор соблюдается

# Дальше

Для того, чтобы улучшить результат и помочь обучению, я попробую лемматизировать все слова

In [11]:
lem = WordNetLemmatizer()

def lemming(text):
    text = wordpunct_tokenize(text.lower())
    return [lem.lemmatize(i) for i in text]

In [12]:
cv = CountVectorizer(tokenizer=lemming, stop_words="english")
X_trained = cv.fit_transform(X_train)
X_tested = cv.transform(X_test)

### 1. Лес

In [13]:
rfcl = RandomForestClassifier()
rfcl.fit(X_trained,  y_train)
pred = rfcl.predict(X_tested)
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

    Cartman       0.44      0.42      0.43       144
      Kenny       0.99      0.96      0.97       161
       Kyle       0.43      0.36      0.39       181
       Stan       0.39      0.47      0.43       154

avg / total       0.56      0.55      0.56       640



### 2. Байес

In [14]:
naive_model = MultinomialNB()
naive_model.fit(X_trained,  y_train)
pred = naive_model.predict(X_tested)
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

    Cartman       0.52      0.63      0.57       144
      Kenny       0.99      0.98      0.98       161
       Kyle       0.54      0.38      0.44       181
       Stan       0.48      0.56      0.52       154

avg / total       0.63      0.63      0.63       640



### 3. Логит

In [15]:
lr = LogisticRegression()
lr.fit(X_trained,  y_train)
pred = lr.predict(X_tested)
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

    Cartman       0.54      0.47      0.50       144
      Kenny       1.00      0.98      0.99       161
       Kyle       0.47      0.45      0.46       181
       Stan       0.44      0.52      0.47       154

avg / total       0.61      0.60      0.61       640



Байес показывает лучшие результаты

Причем для Кенни результаты почти идеальные во всех трех моделях

Попробую другой векторайзер для Байеса

In [20]:
tivc = TfidfVectorizer(tokenizer=lemming, stop_words="english")
X_trained = tivc.fit_transform(X_train)
X_tested = tivc.transform(X_test)

naive_model = MultinomialNB()
naive_model.fit(X_trained,  y_train)
pred = naive_model.predict(X_tested)
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

    Cartman       0.53      0.58      0.55       144
      Kenny       0.97      0.97      0.97       161
       Kyle       0.56      0.40      0.46       181
       Stan       0.47      0.59      0.52       154

avg / total       0.63      0.63      0.63       640



Результат особо не отличается, только чуть более сгладился

В итоге: результат лучше, чем baseline