In [253]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.datasets import fetch_20newsgroups

In [254]:
# 3. Загрузка обучающей и тестовой выборки
remove = ('headers', 'footers', 'quotes')

all_categories = ['comp.windows.x', 'rec.sport.baseball', 'rec.sport.hockey']
train_bunch = fetch_20newsgroups(subset='train', shuffle=True, random_state=42, categories=all_categories,remove=remove)
test_bunch = fetch_20newsgroups(subset='test', shuffle=True, random_state=42, categories=all_categories, remove=remove)


def get_sample(bunch, category_idx):
    for idx, target in enumerate(bunch.target):
        if target == category_idx:
            return bunch.data[idx]

In [255]:
# 4. Вывод по одному документу каждого из классов
get_sample(train_bunch, all_categories.index('comp.windows.x'))

'See subject. An opportunity for sales-people (-persons? -entities?).\n\nI am looking for a commercial/PD graphics editor with fairly limited\nabilities that runs under X and preferably uses Motif widgets. It must\nrun on HP-UX version 9, either with or without the PEX extension. The\nsort of things I want are simple drawing, resizing and moving of objects\nsuch as lines, rectangles and text. Bounding rectangle operations are\nsufficient for object selection. Ideally it should also allow the\ncreation and placement of more complex objects such as widgets (e.g.\ntext entry fields or labels), but this is not mandatory.\n\nDoes anyone have such an animal? If you do, please mail me with details\nincluding price (especially run-time licensing since it must be\nincluded in a product). Alternatively, send glossies to me at Logica,\n68 Newman Street, London W1, including technical info please. For PD\nstuff, I have some effort that could be put into porting.\n\nThanks for any help,\nNicholas Y

In [256]:
get_sample(train_bunch, all_categories.index('rec.sport.baseball'))

"I'm looking for software (hopefully free and runs on Unix box) which will\nkeep track of statistics for my company softball team (batting avg. etc.).\n\nIf you know of any please post or respond to me by e-mail. Many thanks.\n"

In [257]:
get_sample(train_bunch, all_categories.index('rec.sport.hockey'))

'Everyone keeps talking about European expansion by 2010 thinking\nwishful thoughts, but being totally off the ball.\n\nThe league format we use here is incompatible with that in Europe.\n(for those that don\'t know, the best teams from lower divisions get\npromoted and the worst get demoted).\n\nWould European fans put up with our "if you\'ve paid, you can play" \nattitude??\n\nHow long would they support teams that are run on Ranger-based \ncorporate thinking (I use the term lightly).  (We don\'t need a good\nproduct because these duffuses in NYC would fill the arena for Ottawa\'s\nrecord every year......1940!! haha (sorry, had ta say it)).\n\nIf hockey (and other pro sports) had a similar system to Europe, maybe\nteams like the Rangers would be forced to compete (or get demoted the fourth \ndivision).  We\'d have many more teams...centres that aren\'t as big (like\nHalifax or Adirondack....ok, so Halifax isn\'t a good example) would \neventually get promoted, and every team would be

In [258]:
# 5. Стемминг
import nltk
from nltk.stem import *
from nltk import word_tokenize

nltk.download('punkt')


def stemming(documents: list[str]) -> list[str]:
    porter_stemmer = PorterStemmer()
    stem = []
    for document in documents:
        nltk_tokens = word_tokenize(document)
        line = ''
        for word in nltk_tokens:
            line += ' ' + porter_stemmer.stem(word)
        stem.append(line)
    return stem

train_tokenized = stemming(train_bunch.data)
test_tokenized = stemming(test_bunch.data)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dimon\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [259]:
# вывод 3 первых документов обучающих данных
train_tokenized[:3]

[" i 'm look for softwar ( hope free and run on unix box ) which will keep track of statist for my compani softbal team ( bat avg . etc. ) . if you know of ani pleas post or respond to me by e-mail . mani thank .",
 ' i am look for a sourc of american leagu basebal stat for individu player in the same format as print in newspap , ie . i do not want to provid a list of player and get back nice print report for $ 35 a week . doe anyon know of such statist avail and an idea of the cost ?',
 " everyon keep talk about european expans by 2010 think wish thought , but be total off the ball . the leagu format we use here is incompat with that in europ . ( for those that do n't know , the best team from lower divis get promot and the worst get demot ) . would european fan put up with our `` if you 've paid , you can play '' attitud ? ? how long would they support team that are run on ranger-bas corpor think ( i use the term lightli ) . ( we do n't need a good product becaus these duffus in nyc 

In [260]:
# вывод 3 первых документов тестовых данных
test_tokenized[:3]

[" i believ that phil esposito wa the first to wear # 77 when he play with the ranger in the '70 . thi wa the season that they put the ranger crest and `` modern-styl '' number on the jersey instead of the '' new york '' or `` ranger '' block letter and two-colour number . he took # 77 becaus the ranger alreadi had a # 7 . ( hockey night in canada made a big thing out of it , say it wa the biggest uniform style chang in a long time . thi wa befor pittsburgh and vancouv chang their colour . ) dwarf",
 " i 'm run hp-ux 8.07 with hp vue 2.01 and i need a hardcopi of the login window ( the one ask for login and password ) to includ in a manual . i have tri : xwd -display hostname:0 -root -out login.xwd from a login on a remot termin , but it doe n't work . xwd seem to wait for the window server , but the window server doe n't answer . ani suggest ? i must use xwd becaus i do n't have access to ftp and i ca n't obtain anoth program to grab the screen . thank in advanc nadia pitacco --",
 " 

In [261]:
columns = pd.MultiIndex.from_product([['Count', 'TF', 'TF-IDF'], ['Без стоп-слов', 'Со стоп-словами']])
df_train = pd.DataFrame(columns=columns)
df_test = pd.DataFrame(columns=columns)

df_train_stem = pd.DataFrame(columns=columns)
df_test_stem = pd.DataFrame(columns=columns)

In [262]:
# train count

vect = CountVectorizer(max_features=10000)
train_data = vect.fit_transform(train_bunch.data)


def get_20_freq_words(vect, data):
    words = list(zip(vect.get_feature_names_out(), np.ravel(data.sum(axis=0))))
    words.sort(key=lambda x: x[1], reverse=True)
    return words[:20]


count_column = get_20_freq_words(vect, train_data)
df_train['Count', 'Без стоп-слов'] = count_column
count_column

[('the', 15749),
 ('to', 7012),
 ('and', 5437),
 ('of', 5008),
 ('in', 4583),
 ('is', 3967),
 ('that', 3001),
 ('for', 2833),
 ('it', 2597),
 ('on', 2307),
 ('you', 2059),
 ('this', 2006),
 ('be', 1979),
 ('with', 1708),
 ('have', 1652),
 ('are', 1634),
 ('he', 1524),
 ('if', 1522),
 ('as', 1453),
 ('but', 1438)]

In [263]:
# train count stop
vect = CountVectorizer(max_features=10000, stop_words='english')
dtm = vect.fit_transform(train_bunch.data)

count_column_stop = get_20_freq_words(vect, dtm)
df_train['Count', 'Со стоп-словами'] = count_column_stop
count_column_stop

[('team', 679),
 ('game', 633),
 ('year', 629),
 ('file', 586),
 ('like', 582),
 ('10', 580),
 ('window', 573),
 ('edu', 544),
 ('use', 512),
 ('don', 502),
 ('just', 480),
 ('time', 466),
 ('new', 437),
 ('good', 434),
 ('think', 429),
 ('play', 427),
 ('season', 424),
 ('program', 417),
 ('games', 416),
 ('11', 415)]

In [264]:
# train tf

def get_20_freq_words_idf(feature_names, tfidf_values):
    result = []
    word_weights = dict(zip(feature_names, tfidf_values))
    sorted_words = sorted(word_weights.items(), key=lambda x: x[1], reverse=True)
    for word, weight in sorted_words[:20]:
        result.append((word, weight))
    return result


vectorizer = CountVectorizer(max_features=10000)
dtm = vectorizer.fit_transform(train_bunch.data)
tfidf = TfidfTransformer(use_idf=False).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_column = get_20_freq_words_idf(feature_names, tfidf_values)
df_train['TF', 'Без стоп-слов'] = tf_column
tf_column

[('the', 589.4840147265047),
 ('to', 282.9780269588262),
 ('and', 204.73973232096188),
 ('in', 181.95100839538162),
 ('of', 180.65052528156176),
 ('is', 151.84620034325926),
 ('that', 137.00256575679734),
 ('it', 117.94844823750009),
 ('for', 115.18977217608628),
 ('you', 97.72631057215757),
 ('this', 91.94123274499883),
 ('on', 90.5422925115908),
 ('have', 83.8896451423794),
 ('be', 79.4042202847746),
 ('with', 71.58514903092679),
 ('he', 68.66153507608368),
 ('but', 65.45399825662439),
 ('was', 64.89623089168877),
 ('if', 62.427980984910434),
 ('are', 60.70514880130126)]

In [265]:
# train tf stop
vectorizer = CountVectorizer(max_features=10000, stop_words='english')
dtm = vectorizer.fit_transform(train_bunch.data)
tfidf = TfidfTransformer(use_idf=False).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_column_stop = get_20_freq_words_idf(feature_names, tfidf_values)
df_train['TF', 'Со стоп-словами'] = tf_column_stop
tf_column_stop

[('like', 54.59034267706766),
 ('game', 52.65391186175265),
 ('team', 50.23374990934391),
 ('just', 47.839597900758605),
 ('don', 46.667780397895164),
 ('year', 46.66093560424913),
 ('know', 45.38244213178166),
 ('think', 44.39222815673452),
 ('time', 36.49174208101818),
 ('good', 35.62187455720601),
 ('does', 34.76996463457239),
 ('window', 34.42979502162383),
 ('thanks', 33.545069408891216),
 ('games', 33.05600445722264),
 ('use', 28.892433198026094),
 ('players', 28.10991175375138),
 ('play', 27.022282354210148),
 ('season', 26.63108713464234),
 ('way', 25.574312213576814),
 ('did', 24.382018454599265)]

In [266]:
# train tf-idf

vectorizer = CountVectorizer(max_features=10000)
dtm = vectorizer.fit_transform(train_bunch.data)
tfidf = TfidfTransformer(use_idf=True).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_idf = get_20_freq_words_idf(feature_names, tfidf_values)
df_train['TF-IDF', 'Без стоп-слов'] = tf_idf
tf_idf

[('the', 214.36387605105372),
 ('to', 113.78471180759847),
 ('and', 87.3636242603679),
 ('of', 83.5045406913417),
 ('in', 81.71464861099017),
 ('is', 72.71056685601074),
 ('that', 67.62424697731576),
 ('it', 60.63404118880459),
 ('you', 57.61452297032909),
 ('for', 56.101337507476245),
 ('he', 52.38043107664733),
 ('on', 48.851522087063),
 ('this', 48.544997294911376),
 ('have', 46.82516631808222),
 ('be', 45.129856667459215),
 ('was', 44.487884132057964),
 ('with', 41.34900152421049),
 ('they', 39.69318535367601),
 ('but', 38.536211763990764),
 ('are', 38.119468736590726)]

In [267]:
# train tf-idf stop

vectorizer = CountVectorizer(max_features=10000, stop_words='english')
dtm = vectorizer.fit_transform(train_bunch.data)
tfidf = TfidfTransformer(use_idf=True).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_idf_stop = get_20_freq_words_idf(feature_names, tfidf_values)
df_train['TF-IDF', 'Со стоп-словами'] = tf_idf_stop
tf_idf_stop

[('game', 30.55721189057097),
 ('team', 28.81541385536274),
 ('like', 27.880963215705588),
 ('year', 27.3154013693727),
 ('don', 25.993002073961122),
 ('know', 25.980732209617294),
 ('just', 25.498472063457278),
 ('think', 25.276905208763342),
 ('window', 24.08872014213688),
 ('games', 21.938746911155285),
 ('thanks', 21.22302451689207),
 ('time', 21.065804609712245),
 ('good', 21.056475539446414),
 ('does', 20.795041818869304),
 ('players', 19.029886083529593),
 ('use', 18.133972761035878),
 ('play', 17.5914301110468),
 ('season', 17.489935385470744),
 ('hockey', 17.123106074411094),
 ('server', 16.0242298941479)]

In [268]:
# test count
vect = CountVectorizer(max_features=10000)
dtm = vect.fit_transform(test_bunch.data)
count_test_column = get_20_freq_words(vect, dtm)
df_test['Count', 'Без стоп-слов'] = count_test_column
count_test_column

[('the', 8125),
 ('to', 4241),
 ('and', 3009),
 ('of', 2763),
 ('in', 2552),
 ('is', 2144),
 ('that', 1980),
 ('it', 1830),
 ('for', 1679),
 ('you', 1380),
 ('on', 1310),
 ('this', 1245),
 ('with', 1044),
 ('have', 1040),
 ('if', 944),
 ('be', 941),
 ('was', 926),
 ('or', 924),
 ('not', 892),
 ('but', 871)]

In [269]:
# test count stop

vect = CountVectorizer(max_features=10000, stop_words='english')
dtm = vect.fit_transform(test_bunch.data)
count_column_stop = get_20_freq_words(vect, dtm)
df_test['Count', 'Со стоп-словами'] = count_column_stop
count_column_stop

[('game', 605),
 ('dos', 519),
 ('don', 432),
 ('like', 349),
 ('just', 336),
 ('use', 321),
 ('time', 317),
 ('windows', 315),
 ('team', 304),
 ('25', 294),
 ('think', 282),
 ('year', 281),
 ('games', 270),
 ('know', 270),
 ('10', 260),
 ('window', 230),
 ('server', 220),
 ('file', 206),
 ('make', 205),
 ('good', 202)]

In [270]:
# test tf

vectorizer = CountVectorizer(max_features=10000)
dtm = vectorizer.fit_transform(test_bunch.data)
tfidf = TfidfTransformer(use_idf=False).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_column_test = get_20_freq_words_idf(feature_names, tfidf_values)
df_test['TF', 'Без стоп-слов'] = tf_column_test
tf_column_test

[('the', 388.19570382704524),
 ('to', 203.01933538943433),
 ('and', 135.44096301144648),
 ('of', 127.26078196096324),
 ('in', 123.03862800627307),
 ('is', 106.10502668837346),
 ('it', 97.61568506173221),
 ('that', 94.55469727290858),
 ('for', 77.47652053349819),
 ('on', 65.13454482147723),
 ('you', 61.12206519898555),
 ('this', 59.416586959842355),
 ('have', 53.33142911610357),
 ('he', 48.57608623328895),
 ('with', 48.380398864520345),
 ('but', 48.143361122840766),
 ('was', 47.90164582974453),
 ('not', 46.56227772713667),
 ('be', 46.025158234508034),
 ('if', 43.8648196202673)]

In [271]:
# test tf stop

vectorizer = CountVectorizer(max_features=10000, stop_words='english')
dtm = vectorizer.fit_transform(test_bunch.data)
tfidf = TfidfTransformer(use_idf=False).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_column_stop_test = get_20_freq_words_idf(feature_names, tfidf_values)
df_test['TF', 'Со стоп-словами'] = tf_column_stop_test
tf_column_stop_test

[('game', 55.35968033343396),
 ('don', 38.15044738570613),
 ('like', 35.205172019829035),
 ('just', 34.328123139106076),
 ('think', 29.786544763140988),
 ('know', 29.433164060511334),
 ('games', 28.94814382176937),
 ('time', 28.65709850103665),
 ('team', 27.00327166963311),
 ('year', 24.240307796853752),
 ('does', 23.091862495551748),
 ('thanks', 19.721712189483906),
 ('good', 19.614584707858924),
 ('baseball', 19.39262939380386),
 ('use', 19.19725192331969),
 ('way', 18.115307802883056),
 ('did', 17.952251306299022),
 ('ve', 17.94233444024097),
 ('hockey', 17.686119226431057),
 ('really', 16.820040021669158)]

In [272]:
# test tf-idf

vectorizer = CountVectorizer(max_features=10000)
dtm = vectorizer.fit_transform(test_bunch.data)
tfidf = TfidfTransformer(use_idf=True).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_idf_test = get_20_freq_words_idf(feature_names, tfidf_values)
df_test['TF-IDF', 'Без стоп-слов'] = tf_idf_test
tf_idf_test

[('the', 140.30113641970493),
 ('to', 79.53586144681118),
 ('and', 56.99451270506691),
 ('of', 55.27159601479053),
 ('in', 52.39293666308899),
 ('is', 48.310687059760724),
 ('that', 48.16799949109908),
 ('it', 47.13850839985433),
 ('for', 37.3901906909972),
 ('you', 36.762157024970904),
 ('he', 35.43897954887522),
 ('on', 33.9389379350505),
 ('this', 32.40886806943489),
 ('was', 31.237655377296534),
 ('have', 29.87915764367108),
 ('with', 27.752923300677498),
 ('be', 27.394383932350557),
 ('not', 27.23492650564159),
 ('but', 26.901540501389718),
 ('if', 26.13984301917541)]

In [273]:
# test tf-idf stop

vectorizer = CountVectorizer(max_features=10000, stop_words='english')
dtm = vectorizer.fit_transform(test_bunch.data)
tfidf = TfidfTransformer(use_idf=True).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_idf_stop_test = get_20_freq_words_idf(feature_names, tfidf_values)
df_test['TF-IDF', 'Со стоп-словами'] = tf_idf_stop_test
tf_idf_stop_test

[('game', 29.55200964153811),
 ('don', 20.119458203241464),
 ('like', 18.42244901502211),
 ('games', 18.374855394115595),
 ('just', 17.747367300014403),
 ('think', 16.700532672754676),
 ('team', 16.438774847774035),
 ('know', 16.043448881928967),
 ('time', 15.67295845607844),
 ('year', 14.959221513762234),
 ('does', 14.183153289591425),
 ('baseball', 13.800171778669297),
 ('thanks', 12.830586815959304),
 ('hockey', 12.649372505582647),
 ('good', 12.223839804874926),
 ('did', 11.84254390086542),
 ('ve', 11.584730948998143),
 ('use', 11.429258902553467),
 ('really', 11.114383277933392),
 ('way', 10.959152927237552)]

In [274]:
# train count stemming

vect = CountVectorizer(max_features=10000)
dtm = vect.fit_transform(train_tokenized)
count_column_stem = get_20_freq_words(vect, dtm)
df_train_stem['Count', 'Без стоп-слов'] = count_column_stem
count_column_stem

[('the', 15747),
 ('to', 7012),
 ('and', 5437),
 ('of', 5008),
 ('in', 4586),
 ('is', 4061),
 ('that', 3003),
 ('for', 2833),
 ('it', 2787),
 ('on', 2310),
 ('be', 2168),
 ('you', 2059),
 ('thi', 2006),
 ('have', 1793),
 ('with', 1708),
 ('are', 1674),
 ('he', 1524),
 ('if', 1522),
 ('not', 1467),
 ('as', 1453)]

In [275]:
# train count stemming stop
vect = CountVectorizer(max_features=10000, stop_words='english')
dtm = vect.fit_transform(train_tokenized)
count_column_stem_stop = get_20_freq_words(vect, dtm)
df_train_stem['Count', 'Со стоп-словами'] = count_column_stem_stop
count_column_stem_stop

[('thi', 2006),
 ('wa', 1448),
 ('use', 1130),
 ('game', 1037),
 ('team', 912),
 ('ha', 851),
 ('year', 817),
 ('file', 777),
 ('hi', 735),
 ('play', 732),
 ('window', 724),
 ('like', 657),
 ('ani', 621),
 ('run', 615),
 ('program', 600),
 ('10', 580),
 ('doe', 555),
 ('player', 548),
 ('edu', 544),
 ('time', 523)]

In [276]:
# train tf stemming

vectorizer = CountVectorizer(max_features=10000)
dtm = vectorizer.fit_transform(train_tokenized)
tfidf = TfidfTransformer(use_idf=False).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_column_stem = get_20_freq_words_idf(feature_names, tfidf_values)
df_train_stem['TF', 'Без стоп-слов'] = tf_column_stem
tf_column_stem

[('the', 578.9869871527349),
 ('to', 277.5105984525885),
 ('and', 200.61616514687864),
 ('in', 178.80949351346268),
 ('of', 177.27317313531762),
 ('is', 153.25072611761695),
 ('that', 134.5826880157837),
 ('it', 121.64081767176049),
 ('for', 113.09805914145427),
 ('you', 95.88651067888384),
 ('thi', 90.29679169755546),
 ('have', 89.47994881568444),
 ('on', 89.07860484463126),
 ('be', 87.08348630671308),
 ('with', 70.17819208957434),
 ('he', 67.48619409990057),
 ('wa', 67.13462099292734),
 ('but', 64.20796545216623),
 ('do', 62.54094183567242),
 ('if', 61.19748178568055)]

In [277]:
# train tf stemming stop

vectorizer = CountVectorizer(max_features=10000, stop_words='english')
dtm = vectorizer.fit_transform(train_tokenized)
tfidf = TfidfTransformer(use_idf=False).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_column_stem_stop = get_20_freq_words_idf(feature_names, tfidf_values)
df_train_stem['TF', 'Со стоп-словами'] = tf_column_stem_stop
tf_column_stem_stop

[('thi', 141.72279397640855),
 ('wa', 107.29607947837958),
 ('game', 77.4041331357221),
 ('ha', 62.647627160713306),
 ('team', 62.339132757617804),
 ('use', 60.14489569791756),
 ('hi', 59.43782499655776),
 ('ani', 57.279631549360936),
 ('year', 56.538602245273026),
 ('like', 54.51696717354279),
 ('play', 45.59072193731686),
 ('know', 45.53483739388707),
 ('doe', 45.42365375545463),
 ('think', 44.4952399710279),
 ('just', 43.87222983981162),
 ('window', 43.401407981085406),
 ('run', 42.12209183127551),
 ('player', 40.044459890660114),
 ('time', 36.87544216605833),
 ('onli', 36.00373220801312)]

In [278]:
# train tf-idf stemming

vectorizer = CountVectorizer(max_features=10000)
dtm = vectorizer.fit_transform(train_tokenized)
tfidf = TfidfTransformer(use_idf=True).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_idf_column = get_20_freq_words_idf(feature_names, tfidf_values)
df_train_stem['TF-IDF', 'Без стоп-слов'] = tf_idf_column
tf_idf_column

[('the', 215.27647742894334),
 ('to', 114.49060106699532),
 ('and', 87.69641536592607),
 ('of', 83.72973399779124),
 ('in', 82.02034876038448),
 ('is', 74.800742623246),
 ('that', 68.2059599251459),
 ('it', 63.377601730896856),
 ('you', 57.86497701244651),
 ('for', 56.681781334430156),
 ('he', 52.701288473690965),
 ('have', 50.23419071800302),
 ('on', 49.207504287444486),
 ('thi', 48.988062517955576),
 ('be', 48.987493499956045),
 ('wa', 46.48640038163983),
 ('with', 41.577284328675944),
 ('do', 40.25009158646201),
 ('they', 39.95380064531452),
 ('are', 38.934938887306956)]

In [279]:
# train tf-idf stemming stop

vectorizer = CountVectorizer(max_features=10000)
dtm = vectorizer.fit_transform(train_tokenized)
tfidf = TfidfTransformer(use_idf=True).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_idf_column_stop = get_20_freq_words_idf(feature_names, tfidf_values)
df_train_stem['TF-IDF', 'Со стоп-словами'] = tf_idf_column_stop
tf_idf_column_stop

[('the', 215.27647742894334),
 ('to', 114.49060106699532),
 ('and', 87.69641536592607),
 ('of', 83.72973399779124),
 ('in', 82.02034876038448),
 ('is', 74.800742623246),
 ('that', 68.2059599251459),
 ('it', 63.377601730896856),
 ('you', 57.86497701244651),
 ('for', 56.681781334430156),
 ('he', 52.701288473690965),
 ('have', 50.23419071800302),
 ('on', 49.207504287444486),
 ('thi', 48.988062517955576),
 ('be', 48.987493499956045),
 ('wa', 46.48640038163983),
 ('with', 41.577284328675944),
 ('do', 40.25009158646201),
 ('they', 39.95380064531452),
 ('are', 38.934938887306956)]

In [280]:
# test count stemming

vect = CountVectorizer(max_features=10000)
dtm = vect.fit_transform(test_tokenized)
count_column_stem_test = get_20_freq_words(vect, dtm)
df_test_stem['Count', 'Без стоп-слов'] = count_column_stem_test
count_column_stem_test

[('the', 8123),
 ('to', 4246),
 ('and', 3009),
 ('of', 2764),
 ('in', 2552),
 ('is', 2216),
 ('that', 1984),
 ('it', 1905),
 ('for', 1679),
 ('you', 1379),
 ('on', 1316),
 ('thi', 1244),
 ('have', 1146),
 ('be', 1056),
 ('with', 1044),
 ('wa', 977),
 ('if', 944),
 ('not', 932),
 ('or', 924),
 ('do', 880)]

In [281]:
# test count stemming stop

vect = CountVectorizer(max_features=10000, stop_words='english')
dtm = vect.fit_transform(test_tokenized)
count_column_stem_stop_test = get_20_freq_words(vect, dtm)
df_test_stem['Count', 'Со стоп-словами'] = count_column_stem_stop_test
count_column_stem_stop_test

[('thi', 1244),
 ('wa', 977),
 ('game', 870),
 ('use', 652),
 ('hi', 479),
 ('window', 433),
 ('ha', 431),
 ('dos', 417),
 ('ani', 404),
 ('run', 389),
 ('year', 388),
 ('like', 379),
 ('team', 375),
 ('time', 359),
 ('just', 337),
 ('doe', 318),
 ('think', 310),
 ('player', 306),
 ('play', 304),
 ('onli', 301)]

In [282]:
# test tf stemming

vectorizer = CountVectorizer(max_features=10000)
dtm = vectorizer.fit_transform(test_tokenized)
tfidf = TfidfTransformer(use_idf=False).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_column_stem_test = get_20_freq_words_idf(feature_names, tfidf_values)
df_test_stem['TF', 'Без стоп-слов'] = tf_column_stem_test
tf_column_stem_test

[('the', 381.61723786791123),
 ('to', 199.5821221490817),
 ('and', 133.05603026649428),
 ('of', 125.04781738537288),
 ('in', 120.91309421036907),
 ('is', 107.90773267419995),
 ('it', 99.96447529032932),
 ('that', 92.99129664614327),
 ('for', 76.11968441083106),
 ('on', 64.00897742734699),
 ('you', 59.99484538073765),
 ('have', 58.37687581833593),
 ('thi', 58.234131281067995),
 ('be', 51.28218160795185),
 ('wa', 49.252528830945565),
 ('not', 47.95830662939933),
 ('he', 47.80837384406617),
 ('with', 47.532592464961006),
 ('but', 47.29122334861563),
 ('game', 46.98806355389855)]

In [283]:
# test tf stemming stop

vectorizer = CountVectorizer(max_features=10000, stop_words='english')
dtm = vectorizer.fit_transform(test_tokenized)
tfidf = TfidfTransformer(use_idf=False).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_column_stem_stop_test = get_20_freq_words_idf(feature_names, tfidf_values)
df_test_stem['TF', 'Со стоп-словами'] = tf_column_stem_stop_test
tf_column_stem_stop_test

[('thi', 93.27440959800963),
 ('wa', 79.8682109176536),
 ('game', 76.88725169280045),
 ('hi', 43.0887412746446),
 ('use', 40.447015556149815),
 ('ha', 38.83926480239683),
 ('ani', 38.124190402485624),
 ('like', 34.54103012398732),
 ('year', 32.458925202858524),
 ('just', 31.7396596469489),
 ('doe', 31.018407132150937),
 ('team', 30.32434520439766),
 ('know', 29.68349324863127),
 ('time', 29.665787894946952),
 ('think', 29.632726492933937),
 ('play', 25.173256393356308),
 ('did', 24.69744214110062),
 ('run', 24.63068033842519),
 ('onli', 24.235961728890253),
 ('window', 23.090369109795542)]

In [284]:
# test tf-idf stemming

vectorizer = CountVectorizer(max_features=10000)
dtm = vectorizer.fit_transform(test_tokenized)
tfidf = TfidfTransformer(use_idf=True).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_idf_column_stem_test = get_20_freq_words_idf(feature_names, tfidf_values)
df_test_stem['TF-IDF', 'Без стоп-слов'] = tf_idf_column_stem_test
tf_idf_column_stem_test

[('the', 142.30736233271156),
 ('to', 80.8415255195825),
 ('and', 57.75737273696659),
 ('of', 55.919653319291164),
 ('in', 53.054633702797396),
 ('is', 50.28326657972924),
 ('it', 49.218057306394606),
 ('that', 48.83455932774926),
 ('for', 38.03878447829304),
 ('you', 37.268248972423386),
 ('he', 35.970994983076906),
 ('game', 35.10700042982108),
 ('on', 34.38324956592447),
 ('wa', 32.924298462463874),
 ('have', 32.828292294103306),
 ('thi', 32.758945153148524),
 ('be', 30.305989816973778),
 ('not', 28.33920610056298),
 ('with', 28.110970470103823),
 ('but', 27.312131093533925)]

In [285]:
# test tf-idf stemming stop

vectorizer = CountVectorizer(max_features=10000, stop_words='english')
dtm = vectorizer.fit_transform(test_tokenized)
tfidf = TfidfTransformer(use_idf=False).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_idf_column_stem_stop_test = get_20_freq_words_idf(feature_names, tfidf_values)
df_test_stem['TF-IDF', 'Со стоп-словами'] = tf_idf_column_stem_stop_test
tf_idf_column_stem_stop_test

[('thi', 93.27440959800963),
 ('wa', 79.8682109176536),
 ('game', 76.88725169280045),
 ('hi', 43.0887412746446),
 ('use', 40.447015556149815),
 ('ha', 38.83926480239683),
 ('ani', 38.124190402485624),
 ('like', 34.54103012398732),
 ('year', 32.458925202858524),
 ('just', 31.7396596469489),
 ('doe', 31.018407132150937),
 ('team', 30.32434520439766),
 ('know', 29.68349324863127),
 ('time', 29.665787894946952),
 ('think', 29.632726492933937),
 ('play', 25.173256393356308),
 ('did', 24.69744214110062),
 ('run', 24.63068033842519),
 ('onli', 24.235961728890253),
 ('window', 23.090369109795542)]

In [286]:
df_train

Unnamed: 0_level_0,Count,Count,TF,TF,TF-IDF,TF-IDF
Unnamed: 0_level_1,Без стоп-слов,Со стоп-словами,Без стоп-слов,Со стоп-словами,Без стоп-слов,Со стоп-словами
0,"(the, 15749)","(team, 679)","(the, 589.4840147265047)","(like, 54.59034267706766)","(the, 214.36387605105372)","(game, 30.55721189057097)"
1,"(to, 7012)","(game, 633)","(to, 282.9780269588262)","(game, 52.65391186175265)","(to, 113.78471180759847)","(team, 28.81541385536274)"
2,"(and, 5437)","(year, 629)","(and, 204.73973232096188)","(team, 50.23374990934391)","(and, 87.3636242603679)","(like, 27.880963215705588)"
3,"(of, 5008)","(file, 586)","(in, 181.95100839538162)","(just, 47.839597900758605)","(of, 83.5045406913417)","(year, 27.3154013693727)"
4,"(in, 4583)","(like, 582)","(of, 180.65052528156176)","(don, 46.667780397895164)","(in, 81.71464861099017)","(don, 25.993002073961122)"
5,"(is, 3967)","(10, 580)","(is, 151.84620034325926)","(year, 46.66093560424913)","(is, 72.71056685601074)","(know, 25.980732209617294)"
6,"(that, 3001)","(window, 573)","(that, 137.00256575679734)","(know, 45.38244213178166)","(that, 67.62424697731576)","(just, 25.498472063457278)"
7,"(for, 2833)","(edu, 544)","(it, 117.94844823750009)","(think, 44.39222815673452)","(it, 60.63404118880459)","(think, 25.276905208763342)"
8,"(it, 2597)","(use, 512)","(for, 115.18977217608628)","(time, 36.49174208101818)","(you, 57.61452297032909)","(window, 24.08872014213688)"
9,"(on, 2307)","(don, 502)","(you, 97.72631057215757)","(good, 35.62187455720601)","(for, 56.101337507476245)","(games, 21.938746911155285)"


In [287]:
df_test

Unnamed: 0_level_0,Count,Count,TF,TF,TF-IDF,TF-IDF
Unnamed: 0_level_1,Без стоп-слов,Со стоп-словами,Без стоп-слов,Со стоп-словами,Без стоп-слов,Со стоп-словами
0,"(the, 8125)","(game, 605)","(the, 388.19570382704524)","(game, 55.35968033343396)","(the, 140.30113641970493)","(game, 29.55200964153811)"
1,"(to, 4241)","(dos, 519)","(to, 203.01933538943433)","(don, 38.15044738570613)","(to, 79.53586144681118)","(don, 20.119458203241464)"
2,"(and, 3009)","(don, 432)","(and, 135.44096301144648)","(like, 35.205172019829035)","(and, 56.99451270506691)","(like, 18.42244901502211)"
3,"(of, 2763)","(like, 349)","(of, 127.26078196096324)","(just, 34.328123139106076)","(of, 55.27159601479053)","(games, 18.374855394115595)"
4,"(in, 2552)","(just, 336)","(in, 123.03862800627307)","(think, 29.786544763140988)","(in, 52.39293666308899)","(just, 17.747367300014403)"
5,"(is, 2144)","(use, 321)","(is, 106.10502668837346)","(know, 29.433164060511334)","(is, 48.310687059760724)","(think, 16.700532672754676)"
6,"(that, 1980)","(time, 317)","(it, 97.61568506173221)","(games, 28.94814382176937)","(that, 48.16799949109908)","(team, 16.438774847774035)"
7,"(it, 1830)","(windows, 315)","(that, 94.55469727290858)","(time, 28.65709850103665)","(it, 47.13850839985433)","(know, 16.043448881928967)"
8,"(for, 1679)","(team, 304)","(for, 77.47652053349819)","(team, 27.00327166963311)","(for, 37.3901906909972)","(time, 15.67295845607844)"
9,"(you, 1380)","(25, 294)","(on, 65.13454482147723)","(year, 24.240307796853752)","(you, 36.762157024970904)","(year, 14.959221513762234)"


In [288]:
df_train_stem

Unnamed: 0_level_0,Count,Count,TF,TF,TF-IDF,TF-IDF
Unnamed: 0_level_1,Без стоп-слов,Со стоп-словами,Без стоп-слов,Со стоп-словами,Без стоп-слов,Со стоп-словами
0,"(the, 15747)","(thi, 2006)","(the, 578.9869871527349)","(thi, 141.72279397640855)","(the, 215.27647742894334)","(the, 215.27647742894334)"
1,"(to, 7012)","(wa, 1448)","(to, 277.5105984525885)","(wa, 107.29607947837958)","(to, 114.49060106699532)","(to, 114.49060106699532)"
2,"(and, 5437)","(use, 1130)","(and, 200.61616514687864)","(game, 77.4041331357221)","(and, 87.69641536592607)","(and, 87.69641536592607)"
3,"(of, 5008)","(game, 1037)","(in, 178.80949351346268)","(ha, 62.647627160713306)","(of, 83.72973399779124)","(of, 83.72973399779124)"
4,"(in, 4586)","(team, 912)","(of, 177.27317313531762)","(team, 62.339132757617804)","(in, 82.02034876038448)","(in, 82.02034876038448)"
5,"(is, 4061)","(ha, 851)","(is, 153.25072611761695)","(use, 60.14489569791756)","(is, 74.800742623246)","(is, 74.800742623246)"
6,"(that, 3003)","(year, 817)","(that, 134.5826880157837)","(hi, 59.43782499655776)","(that, 68.2059599251459)","(that, 68.2059599251459)"
7,"(for, 2833)","(file, 777)","(it, 121.64081767176049)","(ani, 57.279631549360936)","(it, 63.377601730896856)","(it, 63.377601730896856)"
8,"(it, 2787)","(hi, 735)","(for, 113.09805914145427)","(year, 56.538602245273026)","(you, 57.86497701244651)","(you, 57.86497701244651)"
9,"(on, 2310)","(play, 732)","(you, 95.88651067888384)","(like, 54.51696717354279)","(for, 56.681781334430156)","(for, 56.681781334430156)"


In [289]:
df_test_stem

Unnamed: 0_level_0,Count,Count,TF,TF,TF-IDF,TF-IDF
Unnamed: 0_level_1,Без стоп-слов,Со стоп-словами,Без стоп-слов,Со стоп-словами,Без стоп-слов,Со стоп-словами
0,"(the, 8123)","(thi, 1244)","(the, 381.61723786791123)","(thi, 93.27440959800963)","(the, 142.30736233271156)","(thi, 93.27440959800963)"
1,"(to, 4246)","(wa, 977)","(to, 199.5821221490817)","(wa, 79.8682109176536)","(to, 80.8415255195825)","(wa, 79.8682109176536)"
2,"(and, 3009)","(game, 870)","(and, 133.05603026649428)","(game, 76.88725169280045)","(and, 57.75737273696659)","(game, 76.88725169280045)"
3,"(of, 2764)","(use, 652)","(of, 125.04781738537288)","(hi, 43.0887412746446)","(of, 55.919653319291164)","(hi, 43.0887412746446)"
4,"(in, 2552)","(hi, 479)","(in, 120.91309421036907)","(use, 40.447015556149815)","(in, 53.054633702797396)","(use, 40.447015556149815)"
5,"(is, 2216)","(window, 433)","(is, 107.90773267419995)","(ha, 38.83926480239683)","(is, 50.28326657972924)","(ha, 38.83926480239683)"
6,"(that, 1984)","(ha, 431)","(it, 99.96447529032932)","(ani, 38.124190402485624)","(it, 49.218057306394606)","(ani, 38.124190402485624)"
7,"(it, 1905)","(dos, 417)","(that, 92.99129664614327)","(like, 34.54103012398732)","(that, 48.83455932774926)","(like, 34.54103012398732)"
8,"(for, 1679)","(ani, 404)","(for, 76.11968441083106)","(year, 32.458925202858524)","(for, 38.03878447829304)","(year, 32.458925202858524)"
9,"(you, 1379)","(run, 389)","(on, 64.00897742734699)","(just, 31.7396596469489)","(you, 37.268248972423386)","(just, 31.7396596469489)"


In [291]:
# 8. Конвеер

from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

parameters = {
    'vect__max_features': (500, 1000, 2500, 5000, 10000, None),
    'vect__stop_words': ('english', None),
    'tfidf__use_idf': (True, False),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

grid_search.fit(train_bunch.data, train_bunch.target)

print("Best score: %0.3f" % grid_search.best_score_)
grid_search.best_params_

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best score: 0.929


{'tfidf__use_idf': True,
 'vect__max_features': None,
 'vect__stop_words': 'english'}

In [292]:
print(classification_report(grid_search.predict(train_bunch.data), train_bunch.target))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       596
           1       0.94      0.99      0.97       566
           2       0.99      0.95      0.97       628

    accuracy                           0.97      1790
   macro avg       0.97      0.98      0.97      1790
weighted avg       0.98      0.97      0.97      1790


In [293]:
print(classification_report(grid_search.predict(test_bunch.data), test_bunch.target))

              precision    recall  f1-score   support

           0       0.95      0.96      0.95       393
           1       0.85      0.96      0.90       351
           2       0.97      0.87      0.91       447

    accuracy                           0.92      1191
   macro avg       0.92      0.93      0.92      1191
weighted avg       0.93      0.92      0.92      1191


In [294]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

parameters = {
    'vect__max_features': (500, 1000, 2500, 5000, 10000, None),
    'vect__stop_words': ('english', None),
    'tfidf__use_idf': (True, False),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

grid_search.fit(train_tokenized, train_bunch.target)

print("Best score: %0.3f" % grid_search.best_score_)
grid_search.best_params_

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best score: 0.925


{'tfidf__use_idf': True,
 'vect__max_features': None,
 'vect__stop_words': 'english'}

In [295]:
print(classification_report(grid_search.predict(train_tokenized), train_bunch.target))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       593
           1       0.94      0.99      0.97       566
           2       0.99      0.94      0.97       631

    accuracy                           0.97      1790
   macro avg       0.97      0.98      0.97      1790
weighted avg       0.98      0.97      0.97      1790


In [296]:
print(classification_report(grid_search.predict(test_tokenized), test_bunch.target))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96       395
           1       0.85      0.96      0.90       351
           2       0.97      0.87      0.92       445

    accuracy                           0.93      1191
   macro avg       0.93      0.93      0.93      1191
weighted avg       0.93      0.93      0.93      1191
