In [1]:
import re
import pickle
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
%%time
# Read both tab-separated corpora (dialogues first, then the tagged
# StackOverflow posts) and report how many rows/columns each one has.
df_dialogues, df_stackoverflow = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('./data/dialogues.tsv', './data/tagged_posts.tsv')
)

print(
    f'df_dialogues shape: {df_dialogues.shape}',
    f'df_stackoverflow shape: {df_stackoverflow.shape}',
    sep='\n',
)

df_dialogues shape: (218609, 2)
df_stackoverflow shape: (2171575, 3)
CPU times: user 2.52 s, sys: 396 ms, total: 2.91 s
Wall time: 2.91 s


In [3]:
# Sample data
# `seed` is reused by later cells (train/test split, classifier), and
# `sample_size` caps how many StackOverflow posts are kept for training.
seed, sample_size = 781, 200000

# Draw a reproducible random subset of the posts and preview the first rows.
df_stackoverflow = df_stackoverflow.sample(n=sample_size, random_state=seed)
df_stackoverflow.head(n=5)

Unnamed: 0,post_id,title,tag
631024,9071076,C++ virtual method overload/override compiler ...,c_cpp
353311,5298353,Check a condition and also identify the patter...,php
1547617,23947511,isset($_POST['x']) only works if the submit bu...,php
70588,1353559,Trying to make this star output using a for lo...,c_cpp
534998,7753016,"Django+Postgres: ""current transaction is abort...",python


In [4]:
# Compiled once when the cell runs instead of on every call. Raw strings keep
# the exact same patterns while avoiding the invalid escape sequences that the
# previous non-raw literals contained.
_REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
_BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
_STOPWORDS_CACHE = None


def _english_stopwords():
    """Return the NLTK English stopword set, reading the corpus only once."""
    global _STOPWORDS_CACHE
    if _STOPWORDS_CACHE is None:
        _STOPWORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE


def text_prepare(text, stopwords_set=None):
    """Normalise a piece of text for tf-idf vectorisation.

    Steps, in order: lowercase the text, turn the separator characters
    ``/ ( ) { } [ ] | @ , ;`` into spaces, drop every character that is not a
    digit, a lowercase ASCII letter, a space, ``#``, ``+`` or ``_``, and
    finally remove stopwords.

    Args:
        text: The string to clean.
        stopwords_set: Optional collection of tokens to remove. When omitted
            (the default), the NLTK English stopword list is used and cached
            after the first call.

    Returns:
        The remaining tokens joined by single spaces (an empty string when
        nothing is left).
    """
    if stopwords_set is None:
        stopwords_set = _english_stopwords()

    text = text.lower()
    text = _REPLACE_BY_SPACE_RE.sub(' ', text)
    text = _BAD_SYMBOLS_RE.sub('', text)

    # str.split() with no argument never yields empty tokens, so the joined
    # result has no leading/trailing whitespace and needs no extra strip().
    return ' '.join(token for token in text.split() if token not in stopwords_set)

In [5]:
%%time
# Text Preprocessing
# Clean every title with text_prepare; the cleaned text replaces the raw
# `title` column of df_stackoverflow.
df_stackoverflow.title = df_stackoverflow.title.apply(text_prepare)

# Train test split
# Titles are the features and tags the labels; 10% is held out for evaluation
# and `seed` makes the split reproducible.
X, y = df_stackoverflow.title.values, df_stackoverflow.tag.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=seed)
print(f'Train size={len(X_train):,}, test size={len(X_test):,}')

# Load tf-idf and transform text
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# load dumps produced by this project. The context manager guarantees the file
# handle is closed once the vectorizer has been read.
with open('../api/src/dumps/tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)
X_train_tfidf, X_test_tfidf = vectorizer.transform(X_train), vectorizer.transform(X_test)

Train size=180,000, test size=20,000
CPU times: user 27.3 s, sys: 2.8 s, total: 30.1 s
Wall time: 30.1 s


In [6]:
%%time
# Train model
# One-vs-rest wrapper around an L2-regularised logistic regression
# (liblinear solver, C=5), fitted on the tf-idf features of the training titles.
tag_classifier = OneVsRestClassifier(
    LogisticRegression(
        penalty='l2', C=5,
        random_state=seed,
        solver='liblinear'
    )
)
tag_classifier.fit(X_train_tfidf, y_train)

# Evaluate
# Accuracy and micro-averaged F1 on the held-out split. In the recorded run
# both numbers are identical, so the micro-F1 adds no extra information here.
y_pred = tag_classifier.predict(X_test_tfidf)
acc = metrics.accuracy_score(y_test, y_pred)
f1 = metrics.f1_score(y_test, y_pred, average='micro')
print(f'acc={acc*100:.3f}% - f1={f1*100:.3f}%')

# Save model
# Write through a context manager so the dump is flushed and the file handle
# is closed deterministically instead of whenever the object is collected.
with open('../api/src/dumps/tag_classifier.pkl', 'wb') as classifier_file:
    pickle.dump(tag_classifier, classifier_file)

acc=80.270% - f1=80.270%
CPU times: user 38.2 s, sys: 1min 30s, total: 2min 8s
Wall time: 18.7 s
