In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('nlp_hackathon_search.csv')

In [3]:
df.columns

Index(['document', 'question', 'title', 'document_id'], dtype='object')

In [4]:
df.sample(5)

Unnamed: 0,document,question,title,document_id
50706,The concept's origins can potentially be trace...,When did Jewish law recognize copyright?,Intellectual_property,428f72f61bfe11ea8f2b656571b1b549
12273,At the time of Harold Evans' appointment as ed...,What is the name of the highest-selling tabloi...,The_Times,427072701bfe11ea8f2b656571b1b549
55505,Violent incidents occurred throughout the Pied...,The Cainhoy Incident occurred on what day?,"Charleston,_South_Carolina",429733881bfe11ea8f2b656571b1b549
31635,A comprehensive school is a state school that ...,What countries used comprehensive schools exte...,Comprehensive_school,426aa0d41bfe11ea8f2b656571b1b549
46071,The reproductive system of female insects cons...,Female insects reproductive system contain a p...,Insect,429348d61bfe11ea8f2b656571b1b549


In [5]:
df = df.dropna()

In [6]:
X_train = df['question']
Y_train = df['document_id']

In [7]:
import re
def clean_str(string):
    string = re.sub(r"[^A-Za-z0-9 ]", " ", string)
    string = re.sub(r"\'s", "", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"\s{2,}", " ",string)
    return string            

In [8]:
X_train = [clean_str(str(x)) for x in X_train]

In [9]:
words_count = {}
for t in X_train:
    for w in t.split():
        if w not in words_count:
            words_count[w] = 1
        else:
            words_count[w] += 1

In [10]:
words_count

{'What': 26685,
 'kind': 543,
 'of': 24402,
 'scores': 17,
 'did': 11433,
 'Twilight': 59,
 'Princess': 76,
 'receive': 189,
 'from': 2638,
 'many': 4024,
 'video': 113,
 'game': 209,
 'review': 25,
 'sources': 44,
 'is': 11373,
 'the': 44111,
 'date': 380,
 'that': 3142,
 'Cyrpus': 1,
 'attained': 7,
 'independence': 98,
 'How': 5885,
 'long': 574,
 'Lord': 50,
 'Salisbury': 3,
 'remain': 60,
 'as': 2575,
 'Prime': 60,
 'Minister': 83,
 'purpose': 128,
 'antibiotic': 12,
 'treatment': 43,
 'other': 1066,
 'species': 225,
 'can': 1095,
 'be': 2133,
 'seen': 106,
 'close': 80,
 'to': 13134,
 'shores': 5,
 'Norfolk': 111,
 'Island': 173,
 'Whit': 1,
 'what': 9330,
 'donors': 7,
 'does': 3128,
 'zinc': 100,
 'form': 350,
 'stable': 16,
 'complexes': 3,
 'companies': 128,
 'a': 7567,
 'judge': 61,
 'say': 212,
 'infringed': 2,
 'on': 3076,
 'Dr': 15,
 'Moustakas': 1,
 's': 6090,
 'prior': 96,
 'blue': 34,
 'light': 172,
 'patent': 60,
 'in': 15745,
 '2015': 126,
 'When': 3913,
 'was': 1274

In [11]:
most_common_words = sorted(words_count.items(), key=lambda x: x[1], reverse=True)[:3]

In [12]:
most_common_words

[('the', 44111), ('What', 26685), ('of', 24402)]

In [13]:
WORDS_TO_INDEX = {}
INDEX_TO_WORDS = {}
idx = 0
for w in words_count.keys():
    WORDS_TO_INDEX[w] = idx
    INDEX_TO_WORDS[idx] = w
    idx += 1

In [14]:
ALL_WORDS = WORDS_TO_INDEX.keys()

In [15]:
DICT_SIZE = len(ALL_WORDS)

In [16]:
DICT_SIZE

35525

In [17]:
def bag_of_words(text, words_to_index, dict_size):
    result_vector = np.zeros(dict_size)
    for w in text.split():
        if w in words_to_index:
            result_vector[words_to_index[w]] = 1
    return result_vector

In [18]:
print(bag_of_words('how was your day',{'hi':1,'how':2,'day':3},5))

[0. 0. 1. 1. 0.]


In [19]:
from scipy import sparse as sp_sparse

In [20]:
X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])

X_train shape  (62352, 35525)


In [21]:
print('Bag ofwords : X_train shape : {} '.format(X_train_mybag.shape))

Bag ofwords : X_train shape : (62352, 35525) 


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
def tfidf_features(X):
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    X = tfidf_vectorizer.fit_transform(X)
    return X,tfidf_vectorizer

In [24]:
X_train_tfidf,tfidf_vectorizer = tfidf_features(X_train)
print('X_train shape ', X_train_tfidf.shape)

X_train shape  (62352, 31269)


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [25]:
print('TF_IDF : X_train shape : {} '.format(X_train_tfidf.shape))

TF_IDF : X_train shape : (62352, 31269) 


In [26]:
DOCUMENT_IDS = {}
for y in Y_train:
    if y not in DOCUMENT_IDS:
        DOCUMENT_IDS[y] = 1
    else:
        DOCUMENT_IDS[y] += 1

In [33]:
DOCUMENT_IDS

{'4296602a1bfe11ea8f2b656571b1b549': 6,
 '42840a9c1bfe11ea8f2b656571b1b549': 4,
 '4277df561bfe11ea8f2b656571b1b549': 15,
 '42902d7c1bfe11ea8f2b656571b1b549': 7,
 '4272a11c1bfe11ea8f2b656571b1b549': 4,
 '4298c7e81bfe11ea8f2b656571b1b549': 3,
 '4282bf2a1bfe11ea8f2b656571b1b549': 3,
 '428ec63a1bfe11ea8f2b656571b1b549': 4,
 '42920a661bfe11ea8f2b656571b1b549': 4,
 '427d4cac1bfe11ea8f2b656571b1b549': 3,
 '426a5f841bfe11ea8f2b656571b1b549': 4,
 '4284079a1bfe11ea8f2b656571b1b549': 4,
 '42808cdc1bfe11ea8f2b656571b1b549': 4,
 '426dc41c1bfe11ea8f2b656571b1b549': 3,
 '426a3bee1bfe11ea8f2b656571b1b549': 4,
 '42854d761bfe11ea8f2b656571b1b549': 3,
 '428bbc421bfe11ea8f2b656571b1b549': 4,
 '428610bc1bfe11ea8f2b656571b1b549': 4,
 '42840c5e1bfe11ea8f2b656571b1b549': 4,
 '4272dfb01bfe11ea8f2b656571b1b549': 6,
 '427cd2901bfe11ea8f2b656571b1b549': 3,
 '4273c60a1bfe11ea8f2b656571b1b549': 4,
 '4284acea1bfe11ea8f2b656571b1b549': 4,
 '428b973a1bfe11ea8f2b656571b1b549': 4,
 '427eb2b81bfe11ea8f2b656571b1b549': 3,

In [27]:
from sklearn.preprocessing import LabelBinarizer

In [28]:
mlb = LabelBinarizer()
Y_train_label = mlb.fit_transform(Y_train)

In [29]:
len(Y_train_label)

62352

In [31]:
from sklearn import linear_model
from sklearn import metrics

In [33]:
lr = linear_model.LogisticRegression(C = 0.6, random_state = 1
                            , n_jobs = 8, solver="saga")
lr.fit(X_train_mybag, Y_train_label)
y_train_predicted_label = lr.predict(X_train_tfidf)

print("Training accuracy: ", metrics.accuracy_score(y_train_predicted_label, Y_train_label))

ValueError: bad input shape (62352, 15280)

In [34]:
from sklearn import tree

In [None]:
est = tree.DecisionTreeClassifier()
est.fit(X_train_tfidf, Y_train_label)

In [None]:
print("")print