In [1]:
import spacy
from sklearn.svm import LinearSVC
# Load the spaCy pipeline once; tokenizeText() reads this module-level `parser`.
parser = spacy.load('en_core_web_lg')

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the labelled utterances; header=0 takes the column names from the first CSV row.
df = pd.read_csv('../datasets/Insults.csv',header = 0)

In [4]:
# Quick look at the first rows to confirm the `text` / `intent` columns loaded.
print(df.head())

                                                text  intent
0  Ok... so you can't smack your kids on the bum ...  accept
1  Sitting behind a computer screen some people c...  reject
2  I don't know. My local newspaper if full of as...  reject
3  Alright I'm sorry that in my fit of internet r...  ignore
4  >there is a girl I have been seeing for the pa...  reject


In [5]:

from sklearn.model_selection import train_test_split
# Pull the raw text and its label out as plain arrays, then hold out a test split.
# random_state pins the shuffle so the reported accuracy and the exported
# train/test CSVs are identical on every Restart & Run All.
X = np.array(df['text'])
y = np.array(df['intent'])
X_train,X_test,y_train,y_test = train_test_split(X, y, shuffle=True, random_state=42)

In [6]:
# Pair every utterance with its label and wrap each split in a DataFrame.
# No column names are supplied, so the frames get integer column labels;
# the row-wise helpers below read the text as row[0] and the label as row[1].
train_data = np.array([pair for pair in zip(X_train, y_train)])
test_data = np.array([pair for pair in zip(X_test, y_test)])
TRAIN = pd.DataFrame(train_data)
TEST = pd.DataFrame(test_data)

In [7]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [8]:
def tokenizeText(sample):
    """Return the spaCy vector for ``sample``.

    The text is run through the module-level ``parser`` pipeline and the
    resulting document's ``.vector`` attribute is returned unchanged.
    """
    return parser(sample).vector
    
#     tokens = parser(sample)
#     lemmas = []
#     for tok in tokens:
#         lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
#     tokens = lemmas
#     #tokens = [word.lemma_ for word in tokens if word.lemma_ != "-PRON-"]
#     #tokens = [tok for tok in tokens if tok not in set(SYMBOLS)|STOP_WORDS]
    
#     while "" in tokens:
#         tokens.remove("")
#     while " " in tokens:
#         tokens.remove(" ")
#     while "\n" in tokens:
#         tokens.remove("\n")
#     while "\n\n" in tokens:
#         tokens.remove("\n\n")

#     return tokens

In [9]:
def append_vectors(row):
    """Attach the embedding of a row's text as a new ``'vector'`` entry.

    Parameters
    ----------
    row : pandas.Series
        One DataFrame row (as passed by ``DataFrame.apply(..., axis=1)``)
        whose first field holds the raw text.

    Returns
    -------
    pandas.Series
        The same row with ``row['vector']`` set to ``tokenizeText(text)``.
    """
    # Explicit positional access: the text is always the first field, but its
    # label differs between callers (integer 0 for TRAIN/TEST, the string
    # 'utterance' in predict()). ``row[0]`` on a string-labelled Series relies
    # on pandas' deprecated integer-key positional fallback.
    sample = row.iloc[0]
    row['vector'] = tokenizeText(sample)
    return row


In [10]:
def featurize(dataset):
    """Return ``dataset`` with a ``'vector'`` entry added to every row.

    ``append_vectors`` is applied row-wise (``axis=1``) and the result of
    that ``apply`` call is returned.
    """
    return dataset.apply(append_vectors, axis = 1)


In [11]:
# Embed every training utterance, then replace the column labels with
# readable names before previewing the result.
training_data = featurize(TRAIN)
training_data.columns = ["text","intent","vector"]
training_data.head()

Unnamed: 0,text,intent,vector
0,You're awesome and I now have a crush on you. ...,reject,"[-0.0463381, 0.185636, -0.234928, -0.159231, 0..."
1,Just one example: I posted about the North/Sou...,reject,"[-0.0750384, 0.205054, -0.204887, -0.0879541, ..."
2,"Only in America, you nasty USA-centric jerk. I...",reject,"[-0.143458, 0.140976, -0.119074, -0.0459234, 0..."
3,Conversation between two people that have to u...,reject,"[-0.0275658, 0.214879, -0.287314, -0.0504264, ..."
4,-7:{D,reject,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [12]:
# Rebind the training names to the featurized columns: X_train is now the
# Series of per-utterance vectors, y_train the matching intent labels.
X_train = training_data['vector']
y_train = training_data['intent']

In [13]:
# Linear SVM that will be trained on the document vectors.
clf = LinearSVC()
# Stack the per-row vectors into a single float matrix (one row per utterance).
# The builtin ``float`` replaces ``np.float``, an alias deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
X_train = np.array(list(X_train), dtype=float)

In [14]:
# Train the classifier on the stacked vectors and their intent labels.
clf.fit(X_train, y_train)
#print(cross_val_score(clf,X_train,y_train))

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [15]:
def append_svc_response(row):
    """Predict the intent for one featurized row and store it in ``row['predicted']``.

    Parameters
    ----------
    row : pandas.Series
        A row that already carries a ``'vector'`` entry.

    Returns
    -------
    pandas.Series
        The same row with ``row['predicted']`` set to the label returned by
        the module-level ``clf``, or ``None`` if the prediction failed.
    """
    utterance = row['vector']

    try:
        # reshape/predict are the calls that can actually fail (malformed or
        # wrongly sized vector), so they are the ones guarded here.
        # clf.predict is given a single-row 2-D array, hence reshape(1, -1);
        # np.asarray also lets list-like vectors through.
        ip = np.asarray(utterance, dtype=float).reshape(1, -1)
        row['predicted'] = clf.predict(ip)[0]
    except Exception as e:
        # Best effort: record a missing prediction and report why instead of
        # aborting the whole evaluation run.
        row['predicted'] = None
        print("q = {}, ans = {}" .format(utterance,None))
        print("prediction failed: {!r}".format(e))

    return row

In [16]:
def label_correct_responses(row):
    """Flag whether the predicted label matches the expected one.

    Reads the expected label from ``row[1]`` and the prediction from
    ``row['predicted']``, stores 1 (match) or 0 (mismatch) under
    ``row['Correct']`` and returns the row.
    """
    row['Correct'] = 1 if row[1] == row['predicted'] else 0
    return row

In [17]:
def svc_test(test, write = False):
    """Score the classifier on a featurized frame and print its accuracy.

    Each row first gets a ``'predicted'`` entry from ``append_svc_response``
    and then a ``'Correct'`` flag from ``label_correct_responses``. The mean
    of ``'Correct'`` is printed. When ``write`` is truthy the scored frame is
    also saved to ``'Word2vec with SVC.csv'``. The scored frame is returned.
    """
    scored = test.apply(append_svc_response, axis = 1).apply(label_correct_responses, axis = 1)
    accuracy = scored['Correct'].mean()
    print(accuracy)

    if write:
        scored.to_csv('Word2vec with SVC.csv')

    return scored


In [18]:
# Embed the held-out utterances with the same featurizer used for training.
test_featurized = featurize(TEST)
#test_featurized

# Score the classifier on the test rows (write=False: svc_test saves no CSV),
# then give the result columns readable names before displaying the frame.
test_output = svc_test(test_featurized, False)
test_output.columns = ['utterance','expected intent','vector','predicted','Correct']
test_output

0.68


Unnamed: 0,utterance,expected intent,vector,predicted,Correct
0,Lets hope that the constitution is still consi...,reject,"[0.0252553, 0.107538, -0.0983352, -0.132523, 0...",reject,1
1,Who cares? Down vote.,reject,"[-0.101335, 0.111278, -0.000970999, -0.12211, ...",accept,0
2,"pretty sluts vs ugly sluts, woo hoo",accept,"[-0.330581, 0.091677, -0.126916, -0.106619, 0....",ignore,0
3,I'm the person OP asked. As are you. We're ent...,accept,"[-0.0799253, 0.168318, -0.198868, -0.156765, 0...",reject,0
4,"Do you even know what metric means, you moron?",accept,"[-0.153018, 0.301435, -0.289775, -0.0370332, 0...",accept,1
5,"This mother fucker is from new york city, not ...",ignore,"[0.00145229, 0.0568556, -0.165026, -0.109728, ...",reject,0
6,I TA'd for a bit in grad school. We did all of...,ignore,"[-0.0437662, 0.135699, -0.103565, -0.0333634, ...",reject,0
7,Snipers were made to be like shotguns dumbass....,accept,"[-0.132282, 0.0392806, -0.258326, -0.178421, 0...",accept,1
8,"Have another, you crybaby.",accept,"[-0.194266, 0.229285, -0.256602, -0.111172, 0....",accept,1
9,Upvote for writing REPUBLICANS ARE SUCH STUPID...,accept,"[-0.487342, -0.12598, -0.189224, -0.0917914, -...",accept,1


In [19]:
def predict(utterance):
    """Classify a single raw utterance with the module-level ``clf``.

    Parameters
    ----------
    utterance : str
        Text to classify.

    Returns
    -------
    The label ``clf.predict`` returns for the utterance.
    """
    # Go through featurize() so the text is embedded the same way as the
    # training and test rows.
    row = pd.DataFrame({'utterance':[utterance]})
    row = featurize(row)
    # Builtin ``float`` instead of ``np.float``: the alias was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    test_input = np.array(list(row['vector']), dtype=float)
    return clf.predict(test_input)[0]

# r = pd.DataFrame({'utterance':[utterance]})
# r = featurize(r)
# #ip = pd.Series(r['vector'])
# ip = np.array(list(r['vector']), dtype=np.float)

# clf.predict(ip)
# #response = clf.predict(ip)[0]
# #response

In [20]:
# Persist the scored test rows (utterance, expected/predicted intent, vector, Correct flag).
test_output.to_csv('../datasets/Insults test(Word2vec with SVC).csv')

In [21]:
# Smoke-test the trained classifier on a single hand-written sentence.
utterance = "Have you finished your work at last"
predict(utterance)

'reject'

In [22]:
# Persist the featurized training rows (text, intent, vector).
training_data.to_csv('../datasets/Insults train(Word2vec with SVC).csv')