In [1]:
import pandas as pd
import numpy as np
import re
import difflib
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection, naive_bayes, svm
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
import warnings
warnings.filterwarnings("ignore")


In [2]:
#read the data file for product_information
data = pd.read_csv('Full+data.csv')
#data.head(5)

In [3]:
data.head(2)

Unnamed: 0,product_id,brand,mpn,name,description,brand_category,created_at,updated_at,deleted_at,brand_canonical_url,...,Unnamed: 246,Unnamed: 247,Unnamed: 248,Unnamed: 249,Unnamed: 250,Unnamed: 251,Unnamed: 252,Unnamed: 253,Unnamed: 254,Unnamed: 255
0,01DSRPSZTDW2PGK1YWYXJGKZZ0,FILA,400010319073,Original Fitness Sneakers,Vintage Fitness leather sneakers with logo pri...,TheMensStore/Shoes/Sneakers/LowTop,2019-11-15 23:36:38.98161+00,2019-12-19 20:40:30.786144+00,,https://www.saksfifthavenue.com/fila-original-...,...,,,,,,,,,,
1,01DSQXJBX0R7DCW7KTAC1SW547,CHANEL,400011497371,HAT,,Unknown,2019-11-15 16:15:34.809725+00,2019-12-19 20:40:30.786144+00,,https://www.saksfifthavenue.com/chanel-hat/pro...,...,,,,,,,,,,


In [5]:
#drop columns with all na's 
data = data.dropna(axis=1,how='all')

In [6]:
#Extra_information data
extra_data = pd.read_csv('extra_data.csv')
extra_data.shape

(6621, 14)

Extra_data and Full_data do not have the same columns. So, we subset the data for only important columns and the join the dataframes

In [7]:
#We are interested in these columns as they have the maximum information
cols = ['product_id', 'brand','description', 'brand_category', 'name','details']

In [8]:
extra_data = extra_data.loc[:,cols]
data = data.loc[:,cols]
print(extra_data.shape)
print(data.shape)

(6621, 6)
(42373, 6)


In [9]:
full_data = pd.concat([data,extra_data])
len(full_data) == len(data) + len(extra_data)

True

In [10]:
len(full_data)

48994

In [11]:
#removing duplicate product names
full_data.drop_duplicates(subset=['product_id'], keep="first",inplace = True)

In [12]:
#treating for na variables
full_data.fillna('UNKNOWNTOKEN',inplace=True)
full_data.isnull().sum()

product_id        0
brand             0
description       0
brand_category    0
name              0
details           0
dtype: int64

In [13]:
#clean the data for upper case
cols = full_data.columns[1:]
for col in cols:
    full_data[col] = full_data[col].str.lower()
full_data.head(2)

Unnamed: 0,product_id,brand,description,brand_category,name,details
0,01DSRPSZTDW2PGK1YWYXJGKZZ0,fila,vintage fitness leather sneakers with logo pri...,themensstore/shoes/sneakers/lowtop,original fitness sneakers,leather/synthetic upper\nlace-up closure\ntext...
1,01DSQXJBX0R7DCW7KTAC1SW547,chanel,unknowntoken,unknown,hat,wool tweed & felt


Now, we merge the all tags provided to us and join it with the product information

In [14]:
#read the tags
tags = pd.read_excel('USC+Product+Attribute+Data+03302020.xlsx')
tags.shape

(21925, 4)

In [15]:
#read additional tags
add_tags = pd.read_csv('usc_additional_tags.csv')
add_tags.shape

(97420, 4)

In [16]:
#join them together
all_tags = pd.concat([tags,add_tags])
len(all_tags) == len(tags) + len(add_tags)

True

In [17]:
#treat the tags for name = "Style"
all_tags = all_tags.loc[:,['product_id','attribute_name','attribute_value']]
all_tags.shape

(119345, 3)

subsetting the tags just for attribute_name = style. So, that when we merge back there is no multiple records

In [18]:
tags_style = all_tags[all_tags['attribute_name']=='style']
print(f'All :{tags_style.shape}')
tags_style = tags_style.drop_duplicates()
print(f'Clean:{tags_style.shape}')

All :(18335, 3)
Clean:(13357, 3)


In [19]:
#checking unique values. They should be 11
tags_style['attribute_value'].unique()

array(['Casual', 'Modern', 'Androgynous', 'Romantic', 'Boho',
       'Business Casual', 'Edgy', 'Glam', 'Classic', 'Athleisure',
       'Retro', 'modern', 'businesscasual', 'classic', 'glam', 'edgy',
       'casual', 'retro', 'boho', 'androgynous', 'romantic', 'athleisure'],
      dtype=object)

We see that naming is not consistent. So, we clean the data to make it same

In [20]:
tags_style['attribute_value'] = tags_style['attribute_value'].str.lower()
tags_style = tags_style.drop_duplicates()
print(f'Clean:{tags_style.shape}')

Clean:(10868, 3)


In [21]:
tags_style['attribute_value'].unique()

array(['casual', 'modern', 'androgynous', 'romantic', 'boho',
       'business casual', 'edgy', 'glam', 'classic', 'athleisure',
       'retro', 'businesscasual'], dtype=object)

In [22]:
#We see that business casual and business casual are messed up. So, we clean this

In [23]:
tags_style = tags_style.reset_index(drop = True)
for i in range(0,len(tags_style)):
    if(tags_style.loc[i,'attribute_value']=='businesscasual'):
        tags_style.loc[i,'attribute_value'] = 'business casual'

In [24]:
a = len(tags_style['product_id'].unique())
print(f'Total Unique product id tags for style are : {a}')

Total Unique product id tags for style are : 3916


In [25]:
product_tag_count = tags_style['product_id'].value_counts()

In [26]:
#printing number of products with just 1 tag
product_list = []
for i in product_tag_count.index:
    if(product_tag_count[i]==1):
        product_list.append(i)
print(f'product count with only one unique tag are :{len(product_list)}')

product count with only one unique tag are :233


In [27]:
#printing number of products with more than 1 tag
product_list = []
for i in product_tag_count.index:
    if(product_tag_count[i]>1):
        product_list.append(i)
print(f'product count with multiple unique tag are :{len(product_list)}')

product count with multiple unique tag are :3683


In [28]:
#as we can see there can be multiple value so we create flag for each type of product_tag

list_of_style = tags_style['attribute_value'].unique()

for j in list_of_style:
    for i in range(0,len(tags_style)):
        if(tags_style.loc[i,'attribute_value'] == j):
            tags_style.loc[i,'is_'+j] = 1
        else:
            tags_style.loc[i,'is_'+j] = 0

In [29]:
tags_style.head(5)

Unnamed: 0,product_id,attribute_name,attribute_value,is_casual,is_modern,is_androgynous,is_romantic,is_boho,is_business casual,is_edgy,is_glam,is_classic,is_athleisure,is_retro
0,01DPGV4YRP3Z8J85DASGZ1Y99W,style,casual,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,01E1JM43NQ3H17PB22EV3074NX,style,modern,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,01DT0DJ6GQNF86VZ1EAP047SVC,style,modern,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,01DPH1DEN9G2WM7WAMJMD0A9W4,style,casual,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,01DS3SKHPXXH6AN4362MZYYQAT,style,androgynous,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
#now we create 11 different tag df with respective style and remove duplicates
#casual
casual = tags_style[['product_id','is_casual']]
tags_casual = casual.groupby(['product_id'])['is_casual'].max().reset_index()
#modern
modern = tags_style[['product_id','is_modern']]
tags_modern = modern.groupby(['product_id'])['is_modern'].max().reset_index()
#androgynous
androgynous = tags_style[['product_id','is_androgynous']]
tags_androgynous = androgynous.groupby(['product_id'])['is_androgynous'].max().reset_index()
#romantic
romantic = tags_style[['product_id','is_romantic']]
tags_romantic = romantic.groupby(['product_id'])['is_romantic'].max().reset_index()
#boho
boho = tags_style[['product_id','is_boho']]
tags_boho = boho.groupby(['product_id'])['is_boho'].max().reset_index()
#business casual
business_casual = tags_style[['product_id','is_business casual']]
tags_business_casual = business_casual.groupby(['product_id'])['is_business casual'].max().reset_index()
#edgy
edgy = tags_style[['product_id','is_edgy']]
tags_edgy = edgy.groupby(['product_id'])['is_edgy'].max().reset_index()
#glam
glam = tags_style[['product_id','is_glam']]
tags_glam = glam.groupby(['product_id'])['is_glam'].max().reset_index()
#classic
classic = tags_style[['product_id','is_classic']]
tags_classic = classic.groupby(['product_id'])['is_classic'].max().reset_index()
#athleisure
athleisure = tags_style[['product_id','is_athleisure']]
tags_athleisure = athleisure.groupby(['product_id'])['is_athleisure'].max().reset_index()
#retro
retro = tags_style[['product_id','is_retro']]
tags_retro = retro.groupby(['product_id'])['is_retro'].max().reset_index()

In [31]:
### Now, we need to add different tags one final tag df
df_list = [tags_modern,tags_androgynous,tags_romantic,tags_boho,tags_business_casual,tags_edgy,tags_glam,tags_classic,tags_athleisure,tags_retro]
for df_ in df_list:
    tags_casual = pd.merge(tags_casual, df_, on='product_id', how='left')
tags_style_all = tags_casual

In [32]:
#tags_style_all.to_csv('style.csv')

In [33]:
len(tags_style_all['product_id'].unique())

3916

Now, we go back to cleaning our actual data. Let's merge tags_style_all and all_data

In [34]:
data_label = pd.merge(full_data,tags_style_all,on='product_id', how = 'inner')

In [35]:
#first let's remove the basic stop words from the dataset
from gensim.parsing.preprocessing import STOPWORDS
stop = set(STOPWORDS)
def remove_stopwords(data_col):
    new_list = []
    a = data_col
    for i in range(0,len(a)):
        words = word_tokenize(a[i])
        res_words = []
        for word in words:
            if word not in stop:
                res_words.append(word)
            sentence = " ".join(res_words)
        new_list.append(sentence)
    return new_list

In [36]:
#clean the data for stopwords for every columns
cols = ['brand','description', 'brand_category', 'name','details']
for col in cols:
    data_label[col] = remove_stopwords(data_label[col])
data_label.head(2)

Unnamed: 0,product_id,brand,description,brand_category,name,details,is_casual,is_modern,is_androgynous,is_romantic,is_boho,is_business casual,is_edgy,is_glam,is_classic,is_athleisure,is_retro
0,01DTJCERF6F4NRZ2WSJFFA1EYS,theory,"beige stretch-silk slips 93 % silk , 7 % spand...",clothing / tops / tanks camis,teah stretch-silk camisole,"fits true size , normal size cut slightly loos...",1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
1,01DVPBJ6464YKYGVAE0A1HMKGN,alexander wang,black velvet concealed hook zip fastening 65 %...,clothing / dresses / mini,layered velvet mini dress,"fits true size , normal size designed fitted b...",0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [37]:
#first let's clean description based on some rules
#clean the data using regex
def reg_clean(data,col):
    new_list = []
    for i in range(0,len(data)):
        #special characters 
        a = re.sub(r'[^ a-zA-Z0-9]','',data.loc[i,col])
        #remove multiple spaces by a single space
        a = re.sub(r'\s+',' ',a)
        #timestamp
        a = re.sub(r'\b[0-9]{1,}am|[0-9]{1,}pm|[0-9]{4,}|[0-9]ish|1st|2nd|3rd|[0-9]{1,2}th|31st|[0-9]{1,}min(?:utes)?s?|[0-9]{1,}h(?:ou)?rs?|[0-9]{3,}\b','timestamp',a)
        a = re.sub(r'\b[0-9]{1,}timestamp\b','timestamp',a)
        #any numbers as digit
        a = re.sub(r'\b\d{1,}\b','digit',a)
        #number followed by a variable
        a = re.sub(r'\b\d{1,}[a-z]{0,}[0-9]{0,}','varchar',a)
        #html codes
        a = re.sub(r'<.+?>','html',a)
        a = re.sub(r'https|www','html',a)
        new_list.append(a)
    return new_list

In [40]:
#clean the data for regex for every column
cols = ['brand','description', 'brand_category', 'name','details']
for col in cols:
    data_label[col] = reg_clean(data_label,col)
data_label.head(2)

Unnamed: 0,product_id,brand,description,brand_category,name,details,is_casual,is_modern,is_androgynous,is_romantic,is_boho,is_business casual,is_edgy,is_glam,is_classic,is_athleisure,is_retro
0,01DTJCERF6F4NRZ2WSJFFA1EYS,theory,beige stretchsilk slips digit silk digit spand...,clothing tops tanks camis,teah stretchsilk camisole,fits true size normal size cut slightly loose ...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
1,01DVPBJ6464YKYGVAE0A1HMKGN,alexander wang,black velvet concealed hook zip fastening digi...,clothing dresses mini,layered velvet mini dress,fits true size normal size designed fitted bus...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [41]:
words = set(map(lambda word: word.replace("\n", ""), open("20k.txt").readlines()))
words.add('unknowntoken')
def spellcheck_document(text):
    new_tokens = []
    for token in word_tokenize(text):
        matches = difflib.get_close_matches(token, words, n=1, cutoff=0.7)
        if len(matches) == 0 or token.lower() in words:
            new_tokens.append(token)
        else:
            new_tokens.append(matches[0])
    return " ".join(new_tokens)

Spellcheck is taking forever to run. Hence, commented this section

In [42]:
#spell check the whole document apart from product_id we have added unknown_text as well so that it does not return unknown
#cols = data_label.columns[1:4]
#for col in cols:
#    new_list = []
#    for i in range(0,len(data_label)):
#        new_list.append(spellcheck_document(data_label.loc[i,col]))
#    data_label.col = new_list
#    print(col)
#data_label.head(5)

In [43]:
#lemmatize
def lemmatize_sentence(data_col):
    new_list = []
    a = data_col 
    for i in range(0,len(a)):
        words = word_tokenize(a[i])
        res_words = []
        for word in words:
            res_words.append(lemmatizer.lemmatize(word).strip(string.punctuation))
        sentence = " ".join(res_words)
        new_list.append(sentence)
    return new_list

In [44]:
#clean the data for stopwords for every columns
cols = ['brand','description', 'brand_category', 'name','details']
for col in cols:
    data_label[col] = lemmatize_sentence(data_label[col])
data_label.head(2)

Unnamed: 0,product_id,brand,description,brand_category,name,details,is_casual,is_modern,is_androgynous,is_romantic,is_boho,is_business casual,is_edgy,is_glam,is_classic,is_athleisure,is_retro
0,01DTJCERF6F4NRZ2WSJFFA1EYS,theory,beige stretchsilk slip digit silk digit spande...,clothing top tank camis,teah stretchsilk camisole,fit true size normal size cut slightly loose f...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
1,01DVPBJ6464YKYGVAE0A1HMKGN,alexander wang,black velvet concealed hook zip fastening digi...,clothing dress mini,layered velvet mini dress,fit true size normal size designed fitted bust...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


### Vectorize the data

In [45]:
#here we use tfidf to vectorize the data 
#we can change the parameters in this code cell
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
vectorizer = TfidfVectorizer(token_pattern=r'\b[a-zA-Z]{3,}\b',stop_words="english",binary = True,min_df = 0.005,max_df = 0.7,max_features =300)

In [47]:
#apppend the model_data with feature for each column
columns = ['brand', 'description', 'brand_category', 'name','details']
model_data=pd.DataFrame()
for j in columns:
    corpus = []
    for i in range(0,len(data_label)):
        corpus.append(data_label.loc[i,j])
    vect = vectorizer.fit_transform(corpus)
    terms = vectorizer.get_feature_names()
    c=pd.DataFrame(vect.toarray().transpose(), index=terms)
    model_data=pd.concat([model_data,c.T],axis = 1)
print(f'The Dimensionality of the data is:{model_data.shape}')

The Dimensionality of the data is:(3916, 801)


In [48]:
#Split X and Y for model building phase using 0.2 test_size and random_state = 0 to for repeatability of the code
X=model_data
y=data_label['is_casual'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [49]:
data_label['is_casual'].sum()/len(data_label)

0.6716036772216547

## Neural Network

In [50]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
size=[[64,8,2],[128,32,4]]
epoch=[5,8,10,20]
for i in size:
    for k in epoch:
        nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=i,max_iter=k).fit(X_train, y_train)
        y_pred_RF = nn.predict(X_test)
        print(accuracy_score(y_pred_RF,y_test))

0.6709183673469388
0.6721938775510204
0.6709183673469388
0.8022959183673469
0.6709183673469388
0.7321428571428571
0.7295918367346939
0.735969387755102


In [51]:
#check for other categories of style
X=model_data
y=data_label['is_modern'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
data_label['is_modern'].sum()/len(data_label)

0.46833503575076607

In [52]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
size=[[64,8,2]]
epoch=[5,8,10,20]
for i in size:
    for k in epoch:
        nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=i,max_iter=k).fit(X_train, y_train)
        y_pred_RF = nn.predict(X_test)
        print(accuracy_score(y_pred_RF,y_test))

0.5025510204081632
0.6326530612244898
0.7142857142857143
0.7346938775510204


In [53]:
#check for category with low 1's
X=model_data
y=data_label['is_edgy'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
data_label['is_edgy'].sum()/len(data_label)

0.20531154239019409

In [54]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
size=[[64,8,2],[128,16,4],[256,64,2],[32,4,2]]
epoch=[5,8,10,20]
for i in size:
    for k in epoch:
        nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=i,max_iter=k).fit(X_train, y_train)
        y_pred_RF = nn.predict(X_test)
        print(accuracy_score(y_pred_RF,y_test))

0.7844387755102041
0.7844387755102041
0.7844387755102041
0.8188775510204082
0.7844387755102041
0.7844387755102041
0.7844387755102041
0.8278061224489796
0.7844387755102041
0.7844387755102041
0.7844387755102041
0.8086734693877551
0.7844387755102041
0.7844387755102041
0.7844387755102041
0.8227040816326531


### Neural Network is performing better than normal but it's not very good

### Naive Bayes

In [55]:
#train-test split for naive bayes. Keeping the random_State = 0 for reporducibility
X=model_data
y=data_label['is_modern'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)


0.46833503575076607

In [56]:
#defining Gaussian classifier
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [60]:
data_label['is_modern'].sum()

1834.0

In [65]:
print(f'random is :{data_label.is_modern.sum()/len(data_label)}')
y_pred_NB = classifier.predict(X_test)
print(f'predicted is : {accuracy_score(y_pred_NB,y_test)}')

random is :0.46833503575076607
predicted is : 0.6951530612244898


In [66]:
X=model_data
y=data_label['is_casual'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [68]:
classifier = GaussianNB()
classifier.fit(X_train, y_train)
print(f'random is :{data_label.is_casual.sum()/len(data_label)}')
y_pred_NB = classifier.predict(X_test)
accuracy_score(y_pred_NB,y_test)

random is :0.6716036772216547


0.6823979591836735

In [69]:
X=model_data
y=data_label['is_edgy'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
data_label['is_edgy'].sum()/len(data_label)

0.20531154239019409

In [70]:
classifier = GaussianNB()
classifier.fit(X_train, y_train)
y_pred_NB = classifier.predict(X_test)
accuracy_score(y_pred_NB,y_test)

0.49489795918367346

### Naive bayes classifier is also struggling with smaller percentages of 1. But, it's predicting not overfitting like neural network

### SVM

In [71]:
X=model_data
y=data_label['is_edgy'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
data_label['is_edgy'].sum()/len(data_label)

0.20531154239019409

In [72]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train,y_train)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(X_test)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, y_test)*100)

SVM Accuracy Score ->  81.63265306122449


In [73]:
X=model_data
y=data_label['is_casual'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
data_label['is_casual'].sum()/len(data_label)

0.6716036772216547

In [74]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train,y_train)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(X_test)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, y_test)*100)

SVM Accuracy Score ->  81.88775510204081


In [75]:
X=model_data
y=data_label['is_modern'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
data_label['is_modern'].sum()/len(data_label)

0.46833503575076607

In [76]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train,y_train)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(X_test)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, y_test)*100)

SVM Accuracy Score ->  73.08673469387756


### Logistic

In [77]:
X=model_data
y=data_label['is_modern'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
data_label['is_modern'].sum()/len(data_label)

0.46833503575076607

In [78]:
from sklearn.linear_model import LogisticRegression

logreg=LogisticRegression(n_jobs=1, C=1e5)
               
logreg.fit(X_train, y_train)


y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.6926020408163265


In [79]:
X=model_data
y=data_label['is_casual'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
data_label['is_casual'].sum()/len(data_label)

0.6716036772216547

In [80]:
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.7653061224489796


In [81]:
X=model_data
y=data_label['is_edgy'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
data_label['is_edgy'].sum()/len(data_label)

0.20531154239019409

In [82]:
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.75


### SVM is performing the best among these models. Even logistic is doing decently well mostly because of sparse data and less data