### In this code before running the TF-IDF we look for collocations for different columns,add this collocations, then run the tf-idf vectorizer and then we have implemented multiple N-grams

## Also, as best model in this code we are using this model to print the output

In [1]:
import pandas as pd
import numpy as np
import re
import difflib
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection, naive_bayes, svm
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
import warnings
warnings.filterwarnings("ignore")

In [2]:
#input the data columns
data = pd.read_csv('Full+data.csv')
data.shape

(42373, 256)

In [3]:
data = data.dropna(axis = 1,how ="all")

In [4]:
extra_data = pd.read_csv('extra_data.csv')
extra_data.shape

(6621, 14)

In [5]:
#In this version we are not including brand_url
cols = ['product_id', 'brand','description', 'brand_category', 'name','details']

In [6]:
extra_data = extra_data.loc[:,cols]
data = data.loc[:,cols]
print(extra_data.shape)
print(data.shape)

(6621, 6)
(42373, 6)


In [7]:
full_data = pd.concat([data,extra_data])
len(full_data) == len(data) + len(extra_data)

True

In [8]:
len(full_data)

48994

In [9]:
#removing duplicate product names
full_data.drop_duplicates(subset=['product_id'], keep="first",inplace = True)
len(full_data)

48087

In [10]:
#treating for na variables
full_data.fillna('UNKNOWNTOKEN',inplace=True)
full_data.isnull().sum()

product_id        0
brand             0
description       0
brand_category    0
name              0
details           0
dtype: int64

In [11]:
#clean the data for upper case
cols = full_data.columns[1:]
for col in cols:
    full_data[col] = full_data[col].str.lower()
full_data.head(2)

Unnamed: 0,product_id,brand,description,brand_category,name,details
0,01DSRPSZTDW2PGK1YWYXJGKZZ0,fila,vintage fitness leather sneakers with logo pri...,themensstore/shoes/sneakers/lowtop,original fitness sneakers,leather/synthetic upper\nlace-up closure\ntext...
1,01DSQXJBX0R7DCW7KTAC1SW547,chanel,unknowntoken,unknown,hat,wool tweed & felt


In [11]:
#read the clean tagged data 

tags_style_all = pd.read_csv('style.csv',index_col = 0)

In [12]:
data_label = pd.merge(full_data,tags_style_all,on='product_id', how = 'inner')

In [14]:
data_label.to_csv('data_label.csv')

## Cleaning the data to reduce dimensionality

In [14]:
#first let's remove the basic stop words from the dataset
from gensim.parsing.preprocessing import STOPWORDS
stop = set(STOPWORDS)
def remove_stopwords(data_col):
    new_list = []
    a = data_col
    for i in range(0,len(a)):
        words = word_tokenize(a[i])
        res_words = []
        for word in words:
            if word not in stop:
                res_words.append(word)
            sentence = " ".join(res_words)
        new_list.append(sentence)
    return new_list

In [15]:
cols = ['brand', 'description', 'brand_category', 'name','details']
for col in cols:
    data_label[col] = remove_stopwords(data_label[col])
#data_label.head(5)

In [16]:
#first let's clean description based on some rules
#clean the data using regex
def reg_clean(data,col):
    new_list = []
    for i in range(0,len(data)):
        #special characters 
        a = re.sub(r'[^ a-zA-Z0-9]','',data.loc[i,col])
        #remove multiple spaces by a single space
        a = re.sub(r'\s+',' ',a)
        #timestamp
        a = re.sub(r'\b[0-9]{1,}am|[0-9]{1,}pm|[0-9]{4,}|[0-9]ish|1st|2nd|3rd|[0-9]{1,2}th|31st|[0-9]{1,}min(?:utes)?s?|[0-9]{1,}h(?:ou)?rs?|[0-9]{3,}\b','timestamp',a)
        a = re.sub(r'\b[0-9]{1,}timestamp\b','timestamp',a)
        #any numbers as digit
        a = re.sub(r'\b\d{1,}\b','digit',a)
        #number followed by a variable
        a = re.sub(r'\b\d{1,}[a-z]{0,}[0-9]{0,}','varchar',a)
        #html codes
        a = re.sub(r'<.+?>','html',a)
        a = re.sub(r'https|www','html',a)
        new_list.append(a)
    return new_list

In [17]:
cols = ['brand', 'description', 'brand_category', 'name','details']
for col in cols:
    data_label[col] = reg_clean(data_label,col)
#data_label.head(5)

In [18]:
#lemmatize
def lemmatize_sentence(data_col):
    new_list = []
    a = data_col 
    for i in range(0,len(a)):
        words = word_tokenize(a[i])
        res_words = []
        for word in words:
            res_words.append(lemmatizer.lemmatize(word).strip(string.punctuation))
        sentence = " ".join(res_words)
        new_list.append(sentence)
    return new_list

In [19]:
cols = ['brand', 'description', 'brand_category', 'name','details']
for col in cols:
    data_label[col] = lemmatize_sentence(data_label[col])
#data_label.head(5)

In [20]:
data_label.loc[607,:]

product_id                                   01DVME78NARMF9G6H5HTPG72Q6
brand                                                                  
description           offwhite satin lightbeige crepe chine partiall...
brand_category                                       clothing top shirt
name                                    paneled satin crepe chine shirt
details               fit true size normal size cut loose fit midwei...
is_casual                                                             0
is_modern                                                             1
is_androgynous                                                        1
is_romantic                                                           0
is_boho                                                               0
is_business casual                                                    1
is_edgy                                                               0
is_glam                                                         

In [21]:
### Finding Collocations

In [22]:
stopwords_coll = set(stopwords.words('english') + [".",'.', ",",":", "''", "'s", "'", "``", "(", ")", "-","timestamp","varchar","html","digit"])
filter_stops = lambda w: len(w) < 3 or w in stopwords_coll
def collocation_list(data_col):
    new_list = []
    for i in range(0,len(data_col)):
        words = word_tokenize(data_col[i])
        res_words = []
        for word in words:
            if word not in stopwords_coll:
                res_words.append(word)
        new_list.append(res_words)
    return(new_list)

In [23]:
#creates n-list of n size n = no of columns
cols = ['brand', 'description', 'brand_category', 'name','details']
n_list = []
for col in cols:
    n_list.append(collocation_list(data_label[col]))

In [24]:
# obtain top 25 collocations by raw frequency Change the value of n from 0, length 5 for different columns
collocation_finder = BigramCollocationFinder.from_documents(n_list[0])
measures = BigramAssocMeasures()
collocation_finder.apply_word_filter(filter_stops)
collocation_finder.nbest(BigramAssocMeasures.likelihood_ratio, 25)

[('nili', 'lotan'),
 ('anine', 'bing'),
 ('tory', 'burch'),
 ('sam', 'edelman'),
 ('anthony', 'thomas'),
 ('atm', 'anthony'),
 ('thomas', 'melillo'),
 ('ulla', 'johnson'),
 ('mansur', 'gavriel'),
 ('franco', 'sarto'),
 ('citizen', 'humanity'),
 ('sarto', 'franco'),
 ('charles', 'david'),
 ('zadig', 'voltaire'),
 ('banana', 'republic'),
 ('brochu', 'walker'),
 ('veronica', 'beard'),
 ('alexandre', 'birman'),
 ('aleksandre', 'akhalkatsishvili'),
 ('isabel', 'marant'),
 ('common', 'project'),
 ('jimmy', 'choo'),
 ('jenni', 'kayne'),
 ('angela', 'scott'),
 ('gianvito', 'rossi')]

In [25]:
#As we see most of these words are supposed to be used together
#So, we collate them
#This is collation of brands
data_col = data_label['brand']
new_list = []
for i in range(0,len(data_col)):
    words = word_tokenize(data_col[i])
    res_word = []
    for word in words:
        res_word.append(word)
    sentence = "_".join(res_word)
    new_list.append(sentence)
data_label['brand'] = new_list

In [26]:
#now, we look at description
collocation_finder = BigramCollocationFinder.from_documents(n_list[1])
measures = BigramAssocMeasures()
collocation_finder.apply_word_filter(filter_stops)
#collocation_finder.nbest(BigramAssocMeasures.likelihood_ratio, 25)

In [29]:
# we find that dry clean, high waist, pointed toe should be collated together
#change col_list and data_col to alter for your dataset
col_list = [['dry', 'clean'],['high','waist'],['pointed','toe']]
data_col = data_label['description']
new_list = []
clean_list = ['clean','waist','toe']
for i in range(0,len(data_col)):
    words = word_tokenize(data_col[i])
    len_words = len(words)-1
    for i in range(0,len_words):
        bi_word = []
        j = i+1
        bi_word.append(words[i])
        bi_word.append(words[j])
        if(bi_word in col_list):
            sentence = "_".join(bi_word)
            words[i] = sentence
    for word in words:
        if word in clean_list:
            words.remove(word)
    res_word = []
    for word in words:
        res_word.append(word)
    sentence = " ".join(res_word)
    new_list.append(sentence)
data_label['description'] = new_list

In [30]:
data_label['description'][0]

'beige stretchsilk slip digit silk digit spandex dry_clean imported'

In [31]:
#now, we look at brand_category
collocation_finder = BigramCollocationFinder.from_documents(n_list[2])
measures = BigramAssocMeasures()
collocation_finder.apply_word_filter(filter_stops)
collocation_finder.nbest(BigramAssocMeasures.likelihood_ratio, 25)

[('woman', 'clothing'),
 ('clothing', 'top'),
 ('mid', 'heel'),
 ('boot', 'ankle'),
 ('shoe', 'pump'),
 ('shoe', 'boot'),
 ('bag', 'shoulder'),
 ('shoulder', 'bag'),
 ('clothing', 'dress'),
 ('shoe', 'sandal'),
 ('clothing', 'pant'),
 ('pump', 'mid'),
 ('top', 'blouse'),
 ('skirt', 'short'),
 ('straight', 'leg'),
 ('bag', 'tote'),
 ('medium', 'knit'),
 ('wide', 'leg'),
 ('knitwear', 'medium'),
 ('apr', 'sport'),
 ('sport', 'apr'),
 ('clothing', 'skirt'),
 ('high', 'heel'),
 ('woman', 'shoe'),
 ('tank', 'camis')]

In [32]:
# we find that mid heel, boot ankle, shoe pump, shoe boot, shoulder bag, shoe sandal, straight leg, 
# top blouse, wide leg, high heel should be collated together
col_list = [['mid', 'heel'],['boot','ankle'],['shoe','pump'],['shoe','boot'],['shoulder','bag'],
            ['shoe','sandal'],['straight','leg'],['top','blouse'],['wide','leg'],['high','heel']]
data_col = data_label['brand_category']
new_list = []
clean_list = ['heel','ankle','pump','bag','sandal','leg','blouse']
for i in range(0,len(data_col)):
    words = word_tokenize(data_col[i])
    len_words = len(words)-1
    for i in range(0,len_words):
        bi_word = []
        j = i+1
        bi_word.append(words[i])
        bi_word.append(words[j])
        if(bi_word in col_list):
            sentence = "_".join(bi_word)
            words[i] = sentence
    for word in words:
        if word in clean_list:
            words.remove(word)
    res_word = []
    for word in words:
        res_word.append(word)
    sentence = " ".join(res_word)
    new_list.append(sentence)
data_label['brand_category'] = new_list

In [54]:
#now, we look at name
collocation_finder = BigramCollocationFinder.from_documents(n_list[3])
measures = BigramAssocMeasures()
collocation_finder.apply_word_filter(filter_stops)
top25_list = collocation_finder.nbest(BigramAssocMeasures.likelihood_ratio, 25)
top25_list.remove(('leg', 'jean'))
top25_list.remove(('genuine', 'calf'))

In [55]:
col_list = []
for i in range(0,len(top25_list)):
    a = top25_list[i][0]
    b = top25_list[i][1]
    col_list.append([a,b])
clean_list = []
for i in range(0,len(col_list)):
    clean_list.append(col_list[i][1])

In [57]:
clean_list

['waist',
 'sleeve',
 'jean',
 'sandal',
 'hair',
 'dress',
 'bag',
 'leg',
 'leg',
 'max',
 'bag',
 'print',
 'toe',
 'embossed',
 'skirt',
 'sweater',
 'cotton',
 'neck',
 'skinny',
 'blouse',
 'dress',
 'strap',
 'boot']

In [35]:
data_col = data_label['name']
new_list = []
for i in range(0,len(data_col)):
    words = word_tokenize(data_col[i])
    len_words = len(words)-1
    for i in range(0,len_words):
        bi_word = []
        j = i+1
        bi_word.append(words[i])
        bi_word.append(words[j])
        if(bi_word in col_list):
            sentence = "_".join(bi_word)
            words[i] = sentence
    for word in words:
        if word in clean_list:
            words.remove(word)
    res_word = []
    for word in words:
        res_word.append(word)
    sentence = " ".join(res_word)
    new_list.append(sentence)
data_label['name'] = new_list

In [36]:
#now, we look at name
collocation_finder = BigramCollocationFinder.from_documents(n_list[4])
measures = BigramAssocMeasures()
collocation_finder.apply_word_filter(filter_stops)
top25_list = collocation_finder.nbest(BigramAssocMeasures.likelihood_ratio, 25)
#top25_list

In [37]:
# we find that mid heel, boot ankle, shoe pump, shoe boot, shoulder bag, shoe sandal, straight leg, 
# top blouse, wide leg, high heel should be collated together
col_list = [['true', 'size'],['dry','clean'],['machine','wash'],['tumble','dry'],['hand','wash'],
            ['long','sleeve'],['high','rise']]
data_col = data_label['details']
new_list = []
clean_list = ['size','clean','wash','dry','sleeve','rise']
for i in range(0,len(data_col)):
    words = word_tokenize(data_col[i])
    len_words = len(words)-1
    for i in range(0,len_words):
        bi_word = []
        j = i+1
        bi_word.append(words[i])
        bi_word.append(words[j])
        if(bi_word in col_list):
            sentence = "_".join(bi_word)
            words[i] = sentence
    for word in words:
        if word in clean_list:
            words.remove(word)
    res_word = []
    for word in words:
        res_word.append(word)
    sentence = " ".join(res_word)
    new_list.append(sentence)
data_label['details'] = new_list

In [38]:
col_list = [['true', 'size'],['dry','clean'],['machine','wash'],['tumble','dry'],['hand','wash'],
            ['long','sleeve'],['high','rise'],['fit','true_size']]
data_col = data_label['details']
new_list = []
clean_list = ['size','clean','wash','dry','sleeve','rise','true_size']
for i in range(0,len(data_col)):
    words = word_tokenize(data_col[i])
    len_words = len(words)-1
    for i in range(0,len_words):
        bi_word = []
        j = i+1
        bi_word.append(words[i])
        bi_word.append(words[j])
        if(bi_word in col_list):
            sentence = "_".join(bi_word)
            words[i] = sentence
    for word in words:
        if word in clean_list:
            words.remove(word)
    res_word = []
    for word in words:
        res_word.append(word)
    sentence = " ".join(res_word)
    new_list.append(sentence)
data_label['details'] = new_list

In [39]:
#data_label.to_csv('data_label.csv')

##### Running the above two times to add for fit_true_size which occurs a lot of times. It's a hack but works well

## We need to vectorize our data. 

### TFIDF

In [40]:
vectorizer = TfidfVectorizer(token_pattern=r'\b[a-zA-Z]{3,}\b',stop_words="english",min_df = 0.005,max_df = 0.7)

In [41]:
columns = ['brand', 'description', 'brand_category', 'name','details']
model_data=pd.DataFrame()
for j in columns:
    corpus = []
    for i in range(0,len(data_label)):
        corpus.append(data_label.loc[i,j])
    vect = vectorizer.fit_transform(corpus)
    terms = vectorizer.get_feature_names()
    c=pd.DataFrame(vect.toarray().transpose(), index=terms)
    model_data=pd.concat([model_data,c.T],axis = 1)
print(f'The Dimensionality of the data is:{model_data.shape}')

The Dimensionality of the data is:(3916, 982)


## N-gram tf-idf model
#N = 2,3,(1,3),(2,3)

In [42]:
#Try different N values
vectorizer = TfidfVectorizer(ngram_range=(2,2),token_pattern=r'\b[a-zA-Z]{3,}\b',stop_words = "english",min_df = 1,max_df = 0.7,max_features = 300)

In [43]:
columns = ['description', 'brand_category', 'name','details']
bi_model_data=pd.DataFrame()
for j in columns:
    corpus = []
    for i in range(0,len(data_label)):
        corpus.append(data_label.loc[i,j])
    vect = vectorizer.fit_transform(corpus)
    terms = vectorizer.get_feature_names()
    c=pd.DataFrame(vect.toarray().transpose(), index=terms)
    bi_model_data=pd.concat([bi_model_data,c.T],axis = 1)
print(f'The Dimensionality of the data is:{bi_model_data.shape}')

The Dimensionality of the data is:(3916, 1064)


In [44]:
#Try different N values
vectorizer = TfidfVectorizer(ngram_range=(3,3),token_pattern=r'\b[a-zA-Z]{3,}\b',stop_words="english",max_df = 0.7,max_features = 300)

In [45]:
columns = ['description', 'brand_category', 'name','details']
tri_model_data=pd.DataFrame()
for j in columns:
    corpus = []
    for i in range(0,len(data_label)):
        corpus.append(data_label.loc[i,j])
    vect = vectorizer.fit_transform(corpus)
    terms = vectorizer.get_feature_names()
    c=pd.DataFrame(vect.toarray().transpose(), index=terms)
    tri_model_data=pd.concat([tri_model_data,c.T],axis = 1)
print(f'The Dimensionality of the data is:{tri_model_data.shape}')

The Dimensionality of the data is:(3916, 992)


In [46]:
#Try different N values
vectorizer = TfidfVectorizer(ngram_range=(1,3),token_pattern=r'\b[a-zA-Z]{3,}\b',stop_words="english",min_df = 10,max_df = 0.7,max_features = 300)

In [47]:
columns = ['brand', 'description', 'brand_category', 'name','details']
obi_tri_model_data=pd.DataFrame()
for j in columns:
    corpus = []
    for i in range(0,len(data_label)):
        corpus.append(data_label.loc[i,j])
    vect = vectorizer.fit_transform(corpus)
    terms = vectorizer.get_feature_names()
    c=pd.DataFrame(vect.toarray().transpose(), index=terms)
    obi_tri_model_data=pd.concat([obi_tri_model_data,c.T],axis = 1)
print(f'The Dimensionality of the data is:{obi_tri_model_data.shape}')

The Dimensionality of the data is:(3916, 914)


In [48]:
#Try different N values
vectorizer = TfidfVectorizer(ngram_range=(2,3),token_pattern=r'\b[a-zA-Z]{3,}\b',stop_words=stop,min_df = 10,max_df = 0.7,max_features = 300)

In [49]:
columns = ['description', 'brand_category', 'name','details']
bi_tri_model_data=pd.DataFrame()
for j in columns:
    corpus = []
    for i in range(0,len(data_label)):
        corpus.append(data_label.loc[i,j])
    vect = vectorizer.fit_transform(corpus)
    terms = vectorizer.get_feature_names()
    c=pd.DataFrame(vect.toarray().transpose(), index=terms)
    bi_tri_model_data=pd.concat([bi_tri_model_data,c.T],axis = 1)
print(f'The Dimensionality of the data is:{bi_tri_model_data.shape}')

The Dimensionality of the data is:(3916, 656)


In [50]:
#change X = bi_model_data or model_data to see how different results are changing you can do this for all the different models in other file 
# Also, use stratify = Y in all other options for train-test splitting as well
x_list = [model_data,bi_model_data,tri_model_data,obi_tri_model_data,bi_tri_model_data]
nval_list = [(1,1),(2,2),(3,3),(1,3),(2,3)]
for i in range(0,len(x_list)):
    X=x_list[i]
    y=data_label['is_edgy'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0,stratify = y)
    rand = data_label['is_edgy'].sum()/len(data_label)
    print(f'random is :{max(rand,1-rand)*100}')
    SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(X_train,y_train)
    # predict the labels on validation dataset
    predictions_SVM = SVM.predict(X_test)
    # Use accuracy_score function to get the accuracy
    print(f'SVM Accuracy Score for n-gram {nval_list[i]}:{accuracy_score(predictions_SVM, y_test)*100}')

random is :79.46884576098059
SVM Accuracy Score for n-gram (1, 1):82.78061224489795
random is :79.46884576098059
SVM Accuracy Score for n-gram (2, 2):81.63265306122449
random is :79.46884576098059
SVM Accuracy Score for n-gram (3, 3):82.2704081632653
random is :79.46884576098059
SVM Accuracy Score for n-gram (1, 3):85.45918367346938
random is :79.46884576098059
SVM Accuracy Score for n-gram (2, 3):79.97448979591837


In [51]:
for i in range(0,len(x_list)):
    X=x_list[i]
    y=data_label['is_casual'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0,stratify = y)
    rand = data_label['is_casual'].sum()/len(data_label)
    print(f'random is :{max(rand,1-rand)*100}')
    SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(X_train,y_train)
    # predict the labels on validation dataset
    predictions_SVM = SVM.predict(X_test)
    # Use accuracy_score function to get the accuracy
    print(f'SVM Accuracy Score for n-gram {nval_list[i]}:{accuracy_score(predictions_SVM, y_test)*100}')

random is :67.16036772216547
SVM Accuracy Score for n-gram (1, 1):81.50510204081633
random is :67.16036772216547
SVM Accuracy Score for n-gram (2, 2):76.91326530612244
random is :67.16036772216547
SVM Accuracy Score for n-gram (3, 3):73.72448979591837
random is :67.16036772216547
SVM Accuracy Score for n-gram (1, 3):79.20918367346938
random is :67.16036772216547
SVM Accuracy Score for n-gram (2, 3):75.63775510204081


In [52]:
for i in range(0,len(x_list)):
    X=x_list[i]
    y=data_label['is_modern'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0,stratify = y)
    rand = data_label['is_modern'].sum()/len(data_label)
    print(f'random is :{max(rand,1-rand)*100}')
    SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(X_train,y_train)
    # predict the labels on validation dataset
    predictions_SVM = SVM.predict(X_test)
    # Use accuracy_score function to get the accuracy
    print(f'SVM Accuracy Score for n-gram {nval_list[i]}:{accuracy_score(predictions_SVM, y_test)*100}')

random is :53.166496424923395
SVM Accuracy Score for n-gram (1, 1):72.95918367346938
random is :53.166496424923395
SVM Accuracy Score for n-gram (2, 2):68.49489795918367
random is :53.166496424923395
SVM Accuracy Score for n-gram (3, 3):67.7295918367347
random is :53.166496424923395
SVM Accuracy Score for n-gram (1, 3):74.23469387755102
random is :53.166496424923395
SVM Accuracy Score for n-gram (2, 3):68.23979591836735


### N-Gram (1,3) performs the best we need to try gridsearchcv to fine tune this parameters

In [16]:
#Try different N values
vectorizer = TfidfVectorizer(ngram_range=(1,3),token_pattern=r'\b[a-zA-Z]{3,}\b',stop_words="english",min_df = 10,max_df = 0.7,max_features = 300)

In [17]:
columns = ['brand', 'description', 'brand_category', 'name','details']
obi_tri_model_data=pd.DataFrame()
for j in columns:
    corpus = []
    for i in range(0,len(data_label)):
        corpus.append(data_label.loc[i,j])
    vect = vectorizer.fit_transform(corpus)
    terms = vectorizer.get_feature_names()
    c=pd.DataFrame(vect.toarray().transpose(), index=terms)
    obi_tri_model_data=pd.concat([obi_tri_model_data,c.T],axis = 1)
print(f'The Dimensionality of the data is:{obi_tri_model_data.shape}')

The Dimensionality of the data is:(3916, 1104)


### The best model is used to predict for the entire data

In [46]:
cols = data_label.columns[6:17]
for col in cols:
    x=obi_tri_model_data
    y=data_label[col].values
    SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(x,y)
    # predict the labels on validation dataset
    predictions_SVM = SVM.predict(x)
    data_label[col+'_predicted'] = predictions_SVM

In [38]:
#writing the file to output
data_label.to_csv('data_style_predict.csv')