# Preparing

In [1]:
import os
import pandas as pd

import pickle

# Configuration

In [2]:
# *** Classifying models ***
# Paths to the pickled classifiers, keyed by a short model name.
model_patch = {
    "rfc": "content/models/random_forest_classifier.pickle",
    "sgd": "content/models/sgd_classifier.pickle",
}

# Target dataset (used for the true classification)
key_dataset = 'coffee.xlsx'
#dict_name = 'coffee_dict.xlsx'
non_coffee = 'content/stopwords/non-cofe.xlsx'

# *** Datasets ***
stream_path = 'content/stream/stream/'

# *** Output locations ***
save_patch = 'content/analyse/'
trash_path = 'content/analyse/'

# Dataset structure - field name used in the source datasets
req_str = 'text'

brand_weight_multiplicator = 64
non_category_weight_multiplicator = 1

# Load data

In [3]:
# *** Load the raw request stream: every regular file in `stream_path`, one request per line ***

# Fixed seed so the shuffle (and therefore the row indices shown further down)
# is the same on every Restart & Run All.
SHUFFLE_SEED = 42

stream_frames = []

# os.listdir() returns entries in arbitrary order; sort them so the
# concatenation order is stable between runs.
for entry in sorted(os.listdir(stream_path)):
    entry_path = os.path.join(stream_path, entry)
    if not os.path.isfile(entry_path):
        continue

    stream_frames.append(
        pd.read_csv(
            entry_path,
            encoding='utf8',
            header=None,
            # Regex separator that only matches a newline, so each line is read
            # as a single column; regex separators require the python engine.
            sep='\\n',
            engine='python'
        )
    )

if stream_frames:
    # Concatenate once and shuffle once, instead of re-concatenating and
    # re-shuffling the whole accumulated frame after every file.
    stream_df = (
        pd.concat(stream_frames, sort=False)
        .sample(frac=1, random_state=SHUFFLE_SEED)
        .reset_index(drop=True)
    )
else:
    # No files found: keep an empty frame, as pd.concat([]) would raise.
    stream_df = pd.DataFrame()

print(stream_df.shape)

(9472392, 1)


# Preprocessing and data analysis

In [4]:
# Peek at the first five rows of the loaded stream
print(stream_df.head(5))

                                                   0
0              отмена ограничений из за коронавируса
1                                       чернышевскмй
2  "конспект Понятие образовательного учреждения,...
3  В одну телегу впрячь неможно Коня и трепетную ...
4                             behringer volume pedal


In [5]:
# Set rows with a missing value in column 0 aside in trash_df,
# then remove the NaN rows from the stream.
missing_mask = stream_df[0].isna()
trash_df = stream_df[missing_mask]
stream_df = stream_df.dropna()
trash_df

Unnamed: 0,0
1433516,
1722715,
2074631,
7206712,
9194298,


In [6]:
# (rows, columns) of the rows set aside so far
trash_df.shape

(5, 1)

In [7]:
# Summary statistics for every column, non-numeric ones included
stream_df.describe(include='all')  

Unnamed: 0,0
count,9472387
unique,9371543
top,-
freq,16


In [8]:
# Per the customer, numeric requests are still useful when the request has more
# than one word, so they are kept. Cast every value to str; the following cells
# use the `.str` accessor on column 0 to count words.
stream_df = stream_df.astype(str)

In [9]:
# Requests made of exactly one whitespace-separated word are moved to trash_df.
words_per_query = stream_df[0].str.split().str.len()
single_word = words_per_query == 1

trash_df = pd.concat([trash_df, stream_df[single_word]], sort=False)

# The index is unique here, so keeping the complement of the mask is the same
# as dropping the selected index labels.
stream_df = stream_df[~single_word]

In [10]:
# head(-1) returns every row except the last one; the notebook truncates the display
trash_df.head(-1)

Unnamed: 0,0
1433516,
1722715,
2074631,
7206712,
9194298,
...,...
9472361,CK0hjN8naA806QIHNi2k6idE7tVQBCr85RUKRnODzkbhb8...
9472365,Webisida.com
9472373,camowood
9472379,дщсфешщт


In [11]:
# (rows, columns) of trash_df after adding the one-word requests
trash_df.shape

(1786933, 1)

In [12]:
# Remaining requests (every row except the last one; display is truncated)
stream_df.head(-1)

Unnamed: 0,0
0,отмена ограничений из за коронавируса
2,"""конспект Понятие образовательного учреждения,..."
3,В одну телегу впрячь неможно Коня и трепетную ...
4,behringer volume pedal
5,Русская философия ведет свою историю
...,...
9472385,новинский бульвар 3 стр.1
9472386,интеграл путеизмеритель
9472387,331069 nike
9472388,resident evil revelations 2 спец жпизод


In [13]:
# Very long requests are suspicious as well: anything that is not
# shorter than 12 words is moved to trash_df.
words_per_query = stream_df[0].str.split().str.len()
too_long = ~(words_per_query < 12)

trash_df = pd.concat([trash_df, stream_df[too_long]], sort=False)

# The index is unique here, so keeping the complement of the mask is the same
# as dropping the selected index labels.
stream_df = stream_df[~too_long]

In [14]:
# Shapes of the discarded rows and of the rows kept for prediction
trash_df.shape, stream_df.shape

((2015090, 1), (7457302, 1))

# Get Prediction

In [15]:
# Load the pickled SGD classifier pipeline.
# SECURITY: pickle.load() can execute arbitrary code embedded in the file;
# only load model files that come from a trusted source.
# The `with` block closes the file handle, which the bare open() call leaked.
with open(model_patch["sgd"], 'rb') as model_file:
    model = pickle.load(model_file)

In [16]:
# Display the loaded model's repr
model

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=1000,
                                 min_df=5, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                 SGDClassifier(alpha=0.0001, average=False,
                               class_weight='balanced', early_stopping=False,
                               epsilon=0.1, eta0=0.0, fit_intercept=True,
                               l1_ratio=0.15, learning_rate='optimal',
                   

In [17]:
# Class probabilities for every remaining request in column 0
# (the recorded output shows one row per request and two probability columns)
predicted_sgd_bin = model.predict_proba(stream_df[0])
predicted_sgd_bin

array([[0.64427849, 0.35572151],
       [0.21013956, 0.78986044],
       [0.21013956, 0.78986044],
       ...,
       [0.21013956, 0.78986044],
       [0.63677683, 0.36322317],
       [0.6620061 , 0.3379939 ]])

In [18]:
# Sanity check: number of predictions next to the number of requests
len(predicted_sgd_bin), stream_df[0].shape

(7457302, (7457302,))

In [19]:
# Wrap the probability array in a DataFrame for a tabular preview;
# head(-1) returns every row except the last one.
tmp2 = pd.DataFrame(predicted_sgd_bin)
tmp2.head(-1)

Unnamed: 0,0,1
0,0.644278,0.355722
1,0.210140,0.789860
2,0.210140,0.789860
3,0.210140,0.789860
4,0.732658,0.267342
...,...,...
7457296,0.210140,0.789860
7457297,0.210140,0.789860
7457298,0.210140,0.789860
7457299,0.210140,0.789860


# Get Results Probabilities

In [20]:
# Name the text column, then put each request next to its predicted probabilities.
stream_df = stream_df.rename(columns={0: "query"})

result = pd.concat(
    [
        # Reset the index so the requests align positionally with the
        # 0..n-1 index of the probability frame.
        stream_df["query"].reset_index(drop=True),
        pd.DataFrame(predicted_sgd_bin),
    ],
    axis="columns",
)
result

Unnamed: 0,query,0,1
0,отмена ограничений из за коронавируса,0.644278,0.355722
1,"""конспект Понятие образовательного учреждения,...",0.210140,0.789860
2,behringer volume pedal,0.210140,0.789860
3,Русская философия ведет свою историю,0.210140,0.789860
4,"""Конструктор деревянный \""Болид\"" site:akusher...",0.732658,0.267342
...,...,...,...
7457297,интеграл путеизмеритель,0.210140,0.789860
7457298,331069 nike,0.210140,0.789860
7457299,resident evil revelations 2 спец жпизод,0.210140,0.789860
7457300,ewm 2000 evo какой симистор на убл,0.636777,0.363223


In [21]:
# Add one boolean column per cut-off, flagging rows whose column-1
# probability is strictly greater than that cut-off.
# NOTE(review): with a strict ">" the 1.0 cut-off is all False in the recorded run.
threshold = [1.0, 0.9, 0.8, 0.7]
positive_proba = predicted_sgd_bin[:, 1]
for cutoff in threshold:
    result["P=" + str(cutoff)] = positive_proba > cutoff

In [22]:
# Display result with the added threshold columns
result

Unnamed: 0,query,0,1,P=1.0,P=0.9,P=0.8,P=0.7
0,отмена ограничений из за коронавируса,0.644278,0.355722,False,False,False,False
1,"""конспект Понятие образовательного учреждения,...",0.210140,0.789860,False,False,False,True
2,behringer volume pedal,0.210140,0.789860,False,False,False,True
3,Русская философия ведет свою историю,0.210140,0.789860,False,False,False,True
4,"""Конструктор деревянный \""Болид\"" site:akusher...",0.732658,0.267342,False,False,False,False
...,...,...,...,...,...,...,...
7457297,интеграл путеизмеритель,0.210140,0.789860,False,False,False,True
7457298,331069 nike,0.210140,0.789860,False,False,False,True
7457299,resident evil revelations 2 спец жпизод,0.210140,0.789860,False,False,False,True
7457300,ewm 2000 evo какой симистор на убл,0.636777,0.363223,False,False,False,False


In [23]:
# Rows whose probability in column 1 exceeds 0.7
result[result[1] > 0.7]

Unnamed: 0,query,0,1,P=1.0,P=0.9,P=0.8,P=0.7
1,"""конспект Понятие образовательного учреждения,...",0.210140,0.789860,False,False,False,True
2,behringer volume pedal,0.210140,0.789860,False,False,False,True
3,Русская философия ведет свою историю,0.210140,0.789860,False,False,False,True
5,захватчица рейд,0.210140,0.789860,False,False,False,True
6,алена петровская рябина черная,0.198699,0.801301,False,False,True,True
...,...,...,...,...,...,...,...
7457294,ив сен лораан фильм 2013,0.210140,0.789860,False,False,False,True
7457296,новинский бульвар 3 стр.1,0.210140,0.789860,False,False,False,True
7457297,интеграл путеизмеритель,0.210140,0.789860,False,False,False,True
7457298,331069 nike,0.210140,0.789860,False,False,False,True


In [24]:
# Print the True/False counts of every threshold column,
# with a blank line between consecutive sections.
report_sections = [
    ("P=1", "P=1.0"),
    ("P=0.9", "P=0.9"),
    ("P=0.8", "P=0.8"),
    ("P=0.7", "P=0.7"),
]
for position, (label, column) in enumerate(report_sections):
    if position > 0:
        print("")
    print(label)
    print(result[column].value_counts())

P=1
False    7457302
Name: P=1.0, dtype: int64

P=0.9
False    7426806
True       30496
Name: P=0.9, dtype: int64

P=0.8
False    7399175
True       58127
Name: P=0.8, dtype: int64

P=0.7
True     4538787
False    2918515
Name: P=0.7, dtype: int64


# Save files for analysis

In [25]:
# (rows, columns) of the frame about to be exported
result.shape

(7457302, 7)

In [26]:
# Export result as four consecutive gzip-compressed CSV parts;
# the last part also takes the remainder rows.
quarter = int(result.shape[0] / 4)
part_bounds = [None, quarter, 2 * quarter, 3 * quarter, None]
part_names = ['result1.gz', 'result2.gz', 'result3.gz', 'result4.gz']

for part_name, start, stop in zip(part_names, part_bounds, part_bounds[1:]):
    result[start:stop].to_csv(save_patch + part_name, compression='gzip')

In [27]:
# Export the complete result frame as a single gzip-compressed CSV
result.to_csv(save_patch + 'result.gz', compression='gzip')

In [28]:
# Export the discarded rows as a gzip-compressed CSV
trash_df.to_csv(trash_path + 'trash.gz', compression='gzip')