In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from pathlib import Path
import numpy as np
import newspaper
from itertools import groupby
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import re
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

from sklearn import svm
from sklearn.metrics import classification_report

In [4]:
with open('Data.pkl', 'rb') as input_file:
    Data = pickle.load(input_file)

In [2]:
ans = [el for el,_ in groupby(sorted([str(tag) for tag in Data['answer']]))]
Sentences_by_tag = {}
i=0
for Tag in ans:
    Sentences_by_tag[Tag] = i
    i=i+1
Documents = []
for Unique_tag in Sentences_by_tag.keys():
    Sentences_by_tag[Unique_tag] = []
    Sentences = ''
    i = 0
    for tag in Data['answer']:
        if str(tag) == Unique_tag:
            Sentences += ' '
            Sentences += str(Data['sentence'][i])
        i=i+1
    Sentences_by_tag[Unique_tag] = Sentences
    Documents.append(Sentences_by_tag[Unique_tag])

for i in range(len(Documents)):
    Documents[i] = re.sub(r"[\d+_]", "", Documents[i], flags=re.UNICODE)
vectorizer = TfidfVectorizer(lowercase = True, stop_words = stopwords.words('english')) 
matrix = vectorizer.fit_transform(Documents)


Positions = {}
j=0
for word in vectorizer.get_feature_names():
    Positions[word] = j
    j=j+1
amountWords = len(vectorizer.get_feature_names()) 
Tf_idf = pd.DataFrame(matrix.toarray(), columns = Positions.keys(), index = Sentences_by_tag.keys())
Tf_idf

Unnamed: 0,aaa,aachen,aaron,aas,ab,abadi,abandon,abandoned,abandoning,abandonment,...,zubaydah,zurdos,zvloqcxl,zvolensy,zwerling,ça,être,óscar,über,ülés
0,0.001721,0.000484,0.000484,0.003387,0.000484,0.0,0.002754,0.00241,0.000689,0.000689,...,0.000689,0.0,0.000484,0.000484,0.000484,0.000484,0.000484,0.000484,0.000484,0.000968
1,0.001265,0.0,0.0,0.0,0.0,0.000889,0.001898,0.003795,0.000633,0.001265,...,0.000633,0.000889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
with open('TF-IDF.pkl', 'wb') as output_file: 
    pickle.dump(Tf_idf, output_file, protocol=pickle.HIGHEST_PROTOCOL)
with open('Positions.pkl', 'wb') as output_file: 
    pickle.dump(Positions, output_file, protocol=pickle.HIGHEST_PROTOCOL)


In [5]:
with open('TF-IDF.pkl', 'rb') as input_file:
    Tf_idf = pickle.load(input_file)
with open('Positions.pkl', 'rb') as input_file:
    Positions = pickle.load(input_file)

In [6]:
def normalize(v):
    norm = np.linalg.norm(v)
    if norm == 0: 
        return v
    return v / norm

In [7]:
#Функция, превращающая предложение в список чисел в соответствии со схемой
def To_vector (Sentence, Positions, Tf_Idf):
    stolbec = np.zeros(len(Positions))
    if isinstance(Sentence, str):
        splittedSentence = re.split(r"[,.!?:;\"\'\(\)\%\=\#\&\@\$\-\*\+\>\<\/\ \s]", Sentence)
        splittedSentence = list(filter(None, splittedSentence)) 
        for word in splittedSentence:
            try:
                position = Positions[word.lower()]
                stolbec[position] += 1
            except KeyError:
                continue
        if len(splittedSentence) > 1:
            for c in range (len(splittedSentence)-1):
                pair = splittedSentence[c] + ' ' + splittedSentence[c+1]
                try:
                    position = Positions[word.lower()]
                    stolbec[position] += 1
                except KeyError:
                    continue
    return(normalize(Tf_Idf.dot(stolbec)))

In [8]:
#Превращаем тестовую выборку в вектора
def Create_vectors(Some_data, Positions, Tf_Idf):
    Test = []
    j=1
    x = len(Some_data)
    print("Vectorization in progress:")
    for i in Some_data:
        Test.append(To_vector(i, Positions, Tf_Idf))
        line = str(j) + '/' + str(x)
        print(line, end="\r")
        j += 1 
    return(Test)

In [91]:
X_train, X_test, Y_train, Y_test = train_test_split(Data['sentence'], Data['answer'], test_size=0.15, random_state=42)

In [92]:
line = []
onesIndex = []
ones = []

In [93]:
for row in Y_train.keys():
    if Y_train[row] == 1:
        onesIndex.append(row)
        line.append(X_train[row])

In [94]:
zerData = pd.DataFrame(list(X_train), columns = ['sentence'], index = X_train.keys()).join(pd.DataFrame(list(Y_train), columns = ['answer'], index = Y_train.keys()))
zerData

Unnamed: 0,sentence,answer
1744,How did Warren resolve this troubling issue?,0
15532,To deal with the perceived threat the US has d...,0
7284,The unnamed US citizen assigned to the consula...,0
6388,All presidents since Clinton have crossed coun...,0
811,The White House also announced Monday that it ...,0
3656,"To date, in addition to the three original sig...",0
3090,"The priest would kiss the boys and grope them,...",0
5066,Advertisement,0
12811,"- [And just two days ago, Archbishop Salvatore...",1
15458,It’s respecting the diversity that we have in ...,0


In [95]:
zerData = zerData.drop ([index for index in onesIndex],axis=0)
zerData

Unnamed: 0,sentence,answer
1744,How did Warren resolve this troubling issue?,0
15532,To deal with the perceived threat the US has d...,0
7284,The unnamed US citizen assigned to the consula...,0
6388,All presidents since Clinton have crossed coun...,0
811,The White House also announced Monday that it ...,0
3656,"To date, in addition to the three original sig...",0
3090,"The priest would kiss the boys and grope them,...",0
5066,Advertisement,0
15458,It’s respecting the diversity that we have in ...,0
14951,"In a series of tweets on Thursday, President T...",0


In [96]:
i = 0
while i < len(onesIndex):
    ones.append(1)
    i+=1

In [97]:
onesData = pd.DataFrame(line, columns = ['sentence'], index = onesIndex).join(pd.DataFrame(ones, columns = ['answer'], index = onesIndex))

In [98]:
onesData

Unnamed: 0,sentence,answer
12811,"- [And just two days ago, Archbishop Salvatore...",1
16282,That's what Columbia snowflakes thought was of...,1
13096,She noticed that Latino voters did record-brea...,1
10241,This was unjust.,1
15941,This massive Democrat voter fraud could be the...,1
7889,And Trump is president because he read the par...,1
15433,Assange never even considered fleeing to Russia.,1
9214,"But what, Mr. Farage, are we to think about th...",1
8546,It’s the very reason the governments of the wo...,1
1844,The toxic environment is due almost exclusivel...,1


In [100]:
with open('zerData.pkl', 'wb') as output_file: 
    pickle.dump(zerData, output_file, protocol=pickle.HIGHEST_PROTOCOL)
with open('onesData.pkl', 'wb') as output_file: 
    pickle.dump(onesData, output_file, protocol=pickle.HIGHEST_PROTOCOL)

In [122]:
with open('zerData.pkl', 'rb') as input_file: 
    zerData = pickle.load(input_file)
with open('onesData.pkl', 'rb') as input_file: 
    onesData = pickle.load(input_file)

In [123]:
DataParts = np.array_split(zerData, 5)

In [144]:
for DataPart in DataParts:
    print(pd.concat([DataPart,onesData], ignore_index=True))

                                               sentence  answer
0          How did Warren resolve this troubling issue?       0
1     To deal with the perceived threat the US has d...       0
2     The unnamed US citizen assigned to the consula...       0
3     All presidents since Clinton have crossed coun...       0
4     The White House also announced Monday that it ...       0
5     To date, in addition to the three original sig...       0
6     The priest would kiss the boys and grope them,...       0
7                                         Advertisement       0
8     It’s respecting the diversity that we have in ...       0
9     In a series of tweets on Thursday, President T...       0
10    A Muslima in the U.S. promoted the Islamic Sta...       0
11    We need the Sacrifice of the Mass – not a meal...       0
12    Democratic party leaders provided staffers wit...       0
13    No matter your race, creed, ethnicity, sexual ...       0
14    The emerging homosexual clergy sca

In [142]:
DataParts[0]

Unnamed: 0,sentence,answer
1744,How did Warren resolve this troubling issue?,0
15532,To deal with the perceived threat the US has d...,0
7284,The unnamed US citizen assigned to the consula...,0
6388,All presidents since Clinton have crossed coun...,0
811,The White House also announced Monday that it ...,0
3656,"To date, in addition to the three original sig...",0
3090,"The priest would kiss the boys and grope them,...",0
5066,Advertisement,0
15458,It’s respecting the diversity that we have in ...,0
14951,"In a series of tweets on Thursday, President T...",0


In [15]:
vec_train_x=Create_vectors(X_train, Positions, Tf_idf)


Vectorization in progress:
13852/13852

In [11]:
vec_test_x=Create_vectors(X_test, Positions, Tf_idf)

Vectorization in progress:
2445/2445

In [12]:
# knn = KNeighborsClassifier()

# grid=GridSearchCV(knn, param_grid={'Data': Data['sentence']})
# grid.fit(vec_train_x, Y_train)

In [26]:
with open('X_train.pkl', 'wb') as output_file: 
    pickle.dump(vec_train_x, output_file, protocol=pickle.HIGHEST_PROTOCOL)
with open('X_test.pkl', 'wb') as output_file: 
    pickle.dump(vec_test_x, output_file, protocol=pickle.HIGHEST_PROTOCOL)
with open('Y_train.pkl', 'wb') as output_file: 
    pickle.dump(Y_train, output_file, protocol=pickle.HIGHEST_PROTOCOL)
with open('Y_test.pkl', 'wb') as output_file: 
    pickle.dump(Y_test, output_file, protocol=pickle.HIGHEST_PROTOCOL)

In [41]:
clasas = xgb.XGBClassifier(max_depth = 10, n_estimators =1000)
clasas.fit(np.array(vec_train_x),np.array(Y_train))
clasas.score(np.array(vec_test_x),np.array(Y_test))

0.7132924335378323

In [42]:
print(classification_report(clasas.predict(vec_test_x), Y_test))

              precision    recall  f1-score   support

           0       0.87      0.76      0.81      1999
           1       0.32      0.51      0.40       446

   micro avg       0.71      0.71      0.71      2445
   macro avg       0.60      0.64      0.60      2445
weighted avg       0.77      0.71      0.74      2445



In [59]:
#import svm
#model = svm.svc(parameters)
#rbf(C = 3), линейная модель
#разделить выборку на 5 выборок, в каждой из которых 1/5 нулей и все единицы (см. код с хакатона)
#подготовить данные FLC
#ОБУЧИТЬ МОДЕЛЬ (FIT)
#ПРОВЕРИТЬ КАЧЕСТВО (score)
#classification_report
model = svm.SVC(C = 0.5,gamma = 'scale')
#grid=GridSearchCV(model, param_grid={'Sentence': Data['sentence'],'Answer': Data['answer']})
model.fit(X = (vec_train_x+vec_test_x),y = (list(Y_train)+list(Y_test)))

SVC(C=0.5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [56]:
model.score(vec_train_x+vec_test_x, list(Y_train)+list(Y_test))

0.7252255016260661

In [58]:
print(classification_report(model.predict(vec_train_x+vec_test_x), list(Y_train)+list(Y_test)))

              precision    recall  f1-score   support

           0       0.95      0.74      0.83     14824
           1       0.18      0.58      0.28      1473

   micro avg       0.73      0.73      0.73     16297
   macro avg       0.56      0.66      0.55     16297
weighted avg       0.88      0.73      0.78     16297

