In [13]:
# Settings
import pandas as pd
import numpy as np
import warnings
# Silence every FutureWarning raised from here on.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Silence pandas' PerformanceWarning as well.
# NOTE(review): presumably added to hide the "highly fragmented DataFrame" warning
# triggered by adding columns one at a time further down — confirm.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [14]:
# Read the cat paragraphs from disk and expose the file's columns under the single label 'text'
cats_data = pd.read_csv('cats.csv').set_axis(['text'], axis=1)
cats_data.tail()

Unnamed: 0,text
32,Кошачьи приключения: Многие владельцы рассказы...
33,Котята и их обучение: Обучение котят основным ...
34,Кошачьи традиции: В некоторых культурах сущест...
35,Способы охлаждения: В жаркие дни кошки могут и...
36,Любовь к кошкам: В конечном итоге любовь к кош...


In [15]:
# Read the dog paragraphs from disk and expose the file's columns under the single label 'text'
dogs_data = pd.read_csv('dogs.csv').set_axis(['text'], axis=1)
dogs_data.tail()

Unnamed: 0,text
43,Собака может стать отличным мотиватором для фи...
44,"Многие владельцы собак отмечают, что их питомц..."
45,"Собаки могут стать настоящими членами семьи, п..."
46,Забота о собаке включает в себя не только физи...
47,Уникальная способность собак адаптироваться к ...


In [16]:
# Stack the two corpora (cats first, then dogs) under a fresh 0..n-1 index
df = pd.concat([cats_data, dogs_data], ignore_index=True)

# Labeling: rows keep their concatenation order, so the labels are assigned by position
df['target'] = ['cat'] * len(cats_data) + ['dog'] * len(dogs_data)

In [17]:
# Bringing the text to normal form
def normalize_text(text: str) -> str:
    """Lower-case ``text``, delete a fixed set of punctuation marks and collapse runs of spaces.

    Parameters
    ----------
    text : str
        Raw paragraph.

    Returns
    -------
    str
        The lower-cased text without the characters ``, . - _ : ' —`` and
        with every run of consecutive spaces reduced to a single space.
        Leading and trailing spaces are kept (as a single space each).
    """
    symbols_to_delete = [',', '.', '-', '_', ':', "'", '—']

    # One translation table deletes all unwanted characters in a single pass
    # instead of rebuilding the string character by character.
    deletion_table = str.maketrans('', '', ''.join(symbols_to_delete))
    result = text.lower().translate(deletion_table)

    # Deleting punctuation can leave runs of three or more spaces ("a - - b").
    # A single replace('  ', ' ') pass would leave a double space behind, and
    # every leftover double space turns into an empty "word" after split(' ').
    while '  ' in result:
        result = result.replace('  ', ' ')
    return result

In [18]:
# Create a list of unique words
def create_unique_words_list(df) -> list:
    """Collect the distinct words of all paragraphs in first-seen order.

    Parameters
    ----------
    df : iterable of str
        Paragraphs (texts); each one is split on single spaces.

    Returns
    -------
    list
        Every distinct token exactly once, ordered by first appearance.
    """
    paragraphs = np.array(df)
    # dict keys are unique and keep insertion order, so this removes duplicates
    # with constant-time lookups instead of scanning a growing list per word.
    unique_words = dict.fromkeys(
        word
        for parag in paragraphs  # We go through paragraphs (texts)
        for word in parag.split(' ')  # Through the words in the paragraph
    )
    return list(unique_words)
    

In [19]:
# Locate a word inside the list of unique words
def find_word_index(word, word_list) -> int:
    """Return the position of the first occurrence of ``word`` in ``word_list``.

    The lookup is delegated to ``word_list.index``; for a plain list that
    raises ``ValueError`` when the word is absent.
    """
    position = word_list.index(word)
    return position


In [20]:
# Function for text vectorization
def bag_of_words(df):
    """Encode every paragraph in ``df['text']`` as a list of vocabulary indices.

    Each text is normalized with ``normalize_text``, the vocabulary is built
    with ``create_unique_words_list``, and every space-separated token is
    replaced by its position in that vocabulary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'text'`` column of strings. It is not modified.

    Returns
    -------
    tuple
        ``(vectorized_df, word_list)`` where ``vectorized_df`` is a list with
        one list of integer indices per paragraph and ``word_list`` is the
        vocabulary those indices refer to.
    """
    # Normalize into a fresh list so the caller's column is left untouched
    paragraphs = [normalize_text(text) for text in df.loc[:, 'text']]

    word_list = create_unique_words_list(paragraphs)  # Unique words

    # Build the word -> index mapping once; a list.index() call per token would
    # rescan the whole vocabulary each time. setdefault keeps the first position,
    # matching list.index() semantics.
    word_to_index = {}
    for position, word in enumerate(word_list):
        word_to_index.setdefault(word, position)

    # Output array of vectors: one list of indices per paragraph
    vectorized_df = [
        [word_to_index[word] for word in parag.split(' ')]
        for parag in paragraphs
    ]

    return (vectorized_df, word_list)




In [21]:
# Encode the paragraphs; keep the vocabulary for the later cells
encoded = bag_of_words(df)
vectorized, word_list = encoded
# Swap the raw text for its list-of-indices representation
df['text'] = vectorized
df

Unnamed: 0,text,target
0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",cat
1,"[23, 24, 25, 5, 26, 27, 24, 28, 29, 30, 31, 32...",cat
2,"[36, 28, 2, 42, 43, 44, 45, 12, 46, 47, 48, 16...",cat
3,"[56, 2, 57, 58, 59, 58, 60, 61, 62, 16, 63, 64...",cat
4,"[72, 73, 74, 2, 75, 76, 10, 77, 78, 79, 80, 81...",cat
...,...,...
80,"[776, 67, 669, 203, 777, 92, 778, 389, 484, 11...",dog
81,"[74, 401, 504, 347, 115, 82, 781, 179, 117, 78...",dog
82,"[454, 46, 669, 745, 787, 607, 788, 789, 10, 61...",dog
83,"[726, 403, 690, 727, 10, 518, 263, 489, 793, 1...",dog


In [22]:
# We bring dataframe into a form that can be fed to the model:
# one integer column per unique word, holding how many times that word
# appears in each paragraph.
#
# The whole matrix is filled in a single pass over the encoded paragraphs and
# attached with one concat. Inserting the columns one by one fragments the
# frame (the source of pandas' PerformanceWarning) and rescans every paragraph
# once per vocabulary word.
word_counts = np.zeros((len(df), len(word_list)), dtype=np.int64)
for row, word_ids in enumerate(df['text']):
    for word_id in word_ids:
        word_counts[row, word_id] += 1

# Column order follows word_list, so column k corresponds to word_list[k]
word_counts_df = pd.DataFrame(word_counts, columns=word_list, index=df.index)

# The encoded 'text' column is no longer needed once the counts exist
df = pd.concat([df.drop('text', axis=1), word_counts_df], axis=1)
df

Unnamed: 0,target,история,одомашнивания,кошки,были,одомашнены,более,9,тысяч,лет,...,встречах,значимых,событиях,физический,отвечают,нее,взаимностью,способность,разными,стилями
0,cat,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,cat,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,cat,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,cat,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,cat,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80,dog,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,dog,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
82,dog,0,0,0,0,0,0,0,0,0,...,1,1,1,0,0,0,0,0,0,0
83,dog,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,1,1,0,0,0


In [23]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fixed seed: without it every run draws a different split, so the printed
# accuracy cannot be reproduced.
RANDOM_STATE = 42

X = df.drop('target', axis=1)
# map() yields integer labels directly; replace() on a string column relies on
# pandas' deprecated implicit downcasting (one of the silenced FutureWarnings).
y = df['target'].map({'cat' : 0, 'dog' : 1})

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)

# Train a classification model and calculate an accuracy score
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {round(acc, 3)}")


Accuracy: 0.909


In [24]:
# Based on the magnitude of the coefficients, we show the most important words for each class.
# Labels are encoded cat -> 0, dog -> 1, so the most negative coefficients point
# to cats and the most positive ones to dogs.
TOP_N = 5
coefs = np.asarray(model.coef_[0])

# argsort returns the positions themselves. Looking each sorted value up again
# with list.index() returns the first match only, so tied coefficients would
# all resolve to the same word.
order = np.argsort(coefs, kind='stable')

# Feature columns were created in word_list order, so position k is word_list[k]
cat_words = [word_list[idx] for idx in order[:TOP_N]]
dog_words = [word_list[idx] for idx in order[-TOP_N:]]

print(
    f"The most important words for cats: {cat_words}\nThe most important words for dogs: {dog_words}",
)



The most important words for cats: ['кошки', 'кошек', 'кошачьи', 'от', 'до']
The most important words for dogs: ['делает', 'у', 'людей', 'собак', 'собаки']
