# Preparing

In [1]:
import os
import pandas as pd

import pickle

# Configuration

In [2]:
# *** Datasets ***
# Location of the zipped stream archives and of the extracted stream files.
# The commented-out pair points at the alternative 'test' locations.
zipped_stream_path = 'content/zipped/stream2/'
stream_path = 'content/stream/stream2/'
# zipped_stream_path = 'content/zipped/test/'
# stream_path = 'content/stream/test/'

# *** Classifying models ***
# Paths of the pickled classifiers, keyed by a short model name.
model_patch = {
    "rfc": "content/models/random_forest_classifier.pickle",
    "sgd": "content/models/sgd_classifier_tea.pickle",
}

# *** For saving ***
# Output directories for the prediction results and for the rejected rows.
save_patch = 'content/analyse/'
trash_path = 'content/analyse/'

# Dataset structure - field name used by the source datasets
req_str = 'text'

# Unzip stream files

In [3]:
# Disabled step: extracts every archive found in zipped_stream_path into
# stream_path. Uncomment the lines below to run it.
# import zipfile

# for entry in os.listdir(zipped_stream_path):
#     entry_path = os.path.join(zipped_stream_path, entry)
#     if os.path.isfile(entry_path):

#         with zipfile.ZipFile(entry_path, 'r') as zip_ref:
#             zip_ref.extractall(stream_path)

# Load data

In [4]:
# Read every regular file in stream_path (one request per line) and combine
# them into a single shuffled frame with a fresh 0..n-1 index.
stream_frames = []

for entry in os.listdir(stream_path):
    entry_path = os.path.join(stream_path, entry)
    if not os.path.isfile(entry_path):
        continue  # sub-directories are ignored

    stream_frames.append(
        pd.read_csv(
            entry_path,  # same path that was just checked with isfile
            encoding='utf8',
            header=None,
            # Regex separator handled by the python engine; the recorded
            # (N, 1) shape shows every line ends up in a single column.
            sep='\\n',
            engine='python'
        )
    )

if stream_frames:
    # Concatenate and shuffle once. Doing both inside the loop copies the
    # ever-growing frame for each file, which is quadratic in the total
    # number of rows.
    stream_df = (
        pd.concat(stream_frames, sort=False)
        .sample(frac=1)
        .reset_index(drop=True)
    )
else:
    # No input files: keep the empty frame instead of letting concat raise.
    stream_df = pd.DataFrame()

print(stream_df.shape)

(85474408, 1)


# Preprocessing and data analysis

In [5]:
# Split off the rows whose request is missing: they become trash_df, and
# stream_df keeps only the rows without NaN.
missing_request = stream_df[0].isna()
trash_df = stream_df[missing_request]
stream_df = stream_df.dropna()
trash_df

Unnamed: 0,0
14982,
221395,
357529,
404258,
645304,
...,...
85199457,
85218900,
85274694,
85373979,


In [6]:
# trash_df.shape

In [7]:
# Summary of the request column: row count, number of distinct values, the
# most frequent value and how often it occurs.
stream_df.describe(include='all')

Unnamed: 0,0
count,85473472
unique,9371543
top,1
freq,400112


In [8]:
# Keep a single row for each distinct request.
stream_df = stream_df.drop_duplicates()

In [9]:
# Size of the frame after the duplicates were dropped.
stream_df.shape

(9371543, 1)

In [10]:
# According to the customer, numeric values are useful when the request
# contains more than one word. Cast every value to str so that the .str
# operations in the following cells apply to all rows.
stream_df = stream_df.astype(str)

In [11]:
# Analysing: string accessor of the request column, reused by the length
# statistics in the next cells.
req = stream_df[0].str

In [12]:
# Number of whitespace-separated words in each request.
req_len = req.split().map(len)

In [13]:
# Number of characters in each request.
req_chrs = req.len()

In [14]:
# Report header followed by the number of distinct requests.
print("------", "Text basic statistics", "", sep="\n")
distinct_requests = stream_df[0].unique()
print('Total unique reqests counts:', len(distinct_requests))

------
Text basic statistics

Total unique reqests counts: 9371543


In [15]:
# Average number of words per request.
# NOTE(review): the label says "train dataset", but req_len is computed from
# stream_df - confirm the wording.
print("Mean amount of words in sentences (train dataset): ", req_len.mean())

Mean amount of words in sentences (train dataset):  4.156190181275378


In [16]:
# Average number of characters per request.
print("Mean chars amount in sentences (train dataset): ", req_chrs.mean())

Mean chars amount in sentences (train dataset):  32.065118839021494


In [17]:
# Largest word count and largest character count over all requests, then the
# closing separator of the report.
print("Max amount of words in sentences (train dataset): ", req_len.max())
print("Max chars amount in sentences (train dataset): ", req_chrs.max())
print("")
print("------")

Max amount of words in sentences (train dataset):  28081
Max chars amount in sentences (train dataset):  168480

------


In [18]:
# Requests made of exactly one word are appended to the trash set.
count = stream_df[0].str.split().str.len()
single_word = count == 1

trash_df = pd.concat([trash_df, stream_df[single_word].copy()], sort=False)


In [19]:
# Remove the one-word requests from stream_df by their index labels.
single_word_index = stream_df[count == 1].index
stream_df = stream_df.drop(single_word_index)

In [20]:
# Preview the rejected rows; head(-1) selects every row except the last one.
trash_df.head(-1)

Unnamed: 0,0
14982,
221395,
357529,
404258,
645304,
...,...
85351247,АНАРХОПУТИН
85360645,new+yorker+burger
85375011,тропки.ру
85388912,cbytdf


In [21]:
# Number of rejected rows so far (NaN rows plus one-word requests).
trash_df.shape

(1759400, 1)

In [22]:
# Preview the remaining requests; head(-1) selects every row except the last one.
stream_df.head(-1)

Unnamed: 0,0
0,народные умельцы россии
3,товар фактически поступил на ТТ 28.08.2021
4,ловушка для моли своими руками
5,социально экономическое развитие омской области
6,сом нижний тагил
...,...
85319430,мероприятия 25 июня в саранске
85360160,Прокофьев. Борович. ноты для кларнета и фортеп...
85383931,мемориальный комплекс жертв политических репре...
85393927,уход и размножение кактуса маммиллярия


In [23]:
# Too long is also strange
count = stream_df[0].str.split().str.len()

trash_df = pd.concat(
            [
                trash_df.copy(), 
                stream_df[~(count<15).copy()]
            ], 
            sort=False
        )

stream_df = stream_df.drop(stream_df[~(count<15)].copy().index)

In [24]:
# Sizes of the rejected set and of the retained set after both length filters.
trash_df.shape, stream_df.shape

((1887430, 1), (7485049, 1))

# Get Prediction

In [25]:
# Load the pickled SGD pipeline. The context manager closes the file handle
# as soon as deserialization is done; a bare open() left it open.
# NOTE: pickle.load can execute arbitrary code, so only load model files that
# come from a trusted source.
with open(model_patch["sgd"], 'rb') as model_file:
    model = pickle.load(model_file)

In [26]:
# Display the loaded estimator and its parameters.
model

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=1000,
                                 min_df=5, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                 SGDClassifier(alpha=0.0001, average=False,
                               class_weight='balanced', early_stopping=False,
                               epsilon=0.1, eta0=0.0, fit_intercept=True,
                               l1_ratio=0.15, learning_rate='optimal',
                   

In [27]:
# Class probabilities for every remaining request: one row per request, one
# column per class.
predicted_sgd_bin = model.predict_proba(stream_df[0])
predicted_sgd_bin

array([[0.3389957 , 0.6610043 ],
       [0.8636502 , 0.1363498 ],
       [0.99883238, 0.00116762],
       ...,
       [0.20304025, 0.79695975],
       [0.20304025, 0.79695975],
       [0.20304025, 0.79695975]])

In [28]:
# Sanity check: there should be one probability row per request.
len(predicted_sgd_bin), stream_df[0].shape

(7485049, (7485049,))

In [29]:
# Tabular preview of the probability array; head(-1) selects every row except
# the last one.
tmp2 = pd.DataFrame(predicted_sgd_bin)
tmp2.head(-1)

Unnamed: 0,0,1
0,0.338996,0.661004
1,0.863650,0.136350
2,0.998832,0.001168
3,0.203040,0.796960
4,0.203040,0.796960
...,...,...
7485043,0.203040,0.796960
7485044,0.998832,0.001168
7485045,0.462358,0.537642
7485046,0.203040,0.796960


# Get result probabilities

In [30]:
# Put every request next to its predicted probabilities. The request index is
# reset so that it lines up with the 0..n-1 index of the probability frame.
stream_df = stream_df.rename(columns={0: "query"})

queries = stream_df["query"].copy().reset_index(drop=True)
probabilities = pd.DataFrame(predicted_sgd_bin)
result = pd.concat([queries, probabilities], axis=1)
result

Unnamed: 0,query,0,1
0,народные умельцы россии,0.338996,0.661004
1,товар фактически поступил на ТТ 28.08.2021,0.863650,0.136350
2,ловушка для моли своими руками,0.998832,0.001168
3,социально экономическое развитие омской области,0.203040,0.796960
4,сом нижний тагил,0.203040,0.796960
...,...,...,...
7485044,Прокофьев. Борович. ноты для кларнета и фортеп...,0.998832,0.001168
7485045,мемориальный комплекс жертв политических репре...,0.462358,0.537642
7485046,уход и размножение кактуса маммиллярия,0.203040,0.796960
7485047,sony xperia 5 ii аккумулятор,0.203040,0.796960


In [31]:
# Add one boolean column per cut-off: True where the probability in column 1
# of predicted_sgd_bin is strictly greater than the cut-off.
# NOTE(review): with the strict '>' the 1.0 cut-off yields no True values in
# the recorded output - confirm that '>=' was not intended.
threshold = [1.0, 0.9, 0.8, 0.7]
good = predicted_sgd_bin[:, 1]
for cutoff in threshold:
    result["P=" + str(cutoff)] = good > cutoff

In [32]:
# Rows whose probability in column 1 exceeds 0.9.
result[result[1] > 0.9]

Unnamed: 0,query,0,1,P=1.0,P=0.9,P=0.8,P=0.7
15,расстояние белгород майский белгородская область,0.050217,0.949783,False,True,True,True
124,Дракон Арr,0.093569,0.906431,False,True,True,True
170,В тот серый скучный вечер аккорды,0.097870,0.902130,False,True,True,True
226,энергия гор чай,0.020400,0.979600,False,True,True,True
560,Брюса Липтон,0.057810,0.942190,False,True,True,True
...,...,...,...,...,...,...,...
7483504,ооо «эс ти кроун»,0.056429,0.943571,False,True,True,True
7483860,&search=Наклейка: StarLadder (золотая),0.078369,0.921631,False,True,True,True
7484019,промоакция нури,0.011415,0.988585,False,True,True,True
7484445,результаты выигрыша грин кард 2020,0.060950,0.939050,False,True,True,True


In [33]:
# Print, for each cut-off, how many requests are flagged True / False.
# Reports are separated by one blank line; each pair is (printed title,
# column name in result).
reports = [
    ("P=1", "P=1.0"),
    ("P=0.9", "P=0.9"),
    ("P=0.8", "P=0.8"),
    ("P=0.7", "P=0.7"),
]
for position, (title, column) in enumerate(reports):
    if position:
        print("")
    print(title)
    print(result[column].value_counts())

P=1
False    7485049
Name: P=1.0, dtype: int64

P=0.9
False    7474004
True       11045
Name: P=0.9, dtype: int64

P=0.8
False    7353364
True      131685
Name: P=0.8, dtype: int64

P=0.7
True     4568239
False    2916810
Name: P=0.7, dtype: int64


# Save files for analysis

In [34]:
# Size of the result frame that is about to be written.
result.shape

(7485049, 7)

In [35]:
# Write result as four consecutive gzip-compressed CSV chunks
# (result1.gz .. result4.gz) inside save_patch.

# to_csv does not create missing directories; create the target up front so
# the run does not fail at the very last step.
if save_patch:
    os.makedirs(save_patch, exist_ok=True)

n_parts = 4
i = len(result) // n_parts  # rows per chunk (the last chunk may be larger)
for part in range(n_parts):
    start = part * i
    # The last chunk is open-ended so the remainder rows are included.
    stop = (part + 1) * i if part < n_parts - 1 else None
    result[start:stop].to_csv(
        save_patch + 'result' + str(part + 1) + '.gz',
        compression='gzip'
    )

In [36]:
# Also write the complete result frame as one gzip-compressed CSV.
result.to_csv(save_patch + 'result.gz', compression='gzip')

In [37]:
# Write the rejected rows (NaN, one-word and over-long requests) as a
# gzip-compressed CSV.
trash_df.to_csv(trash_path + 'trash.gz', compression='gzip')