In [1]:
import pandas as pd
import numpy as np


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer #used for multi label classification
from sklearn.model_selection import train_test_split

#machine learning algorithms
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

#we use the below because we will be using the logistic regression and multilabel binarizer
from sklearn.multiclass import OneVsRestClassifier # will do a classification of one class vs another(multiclass/multi label strategy)

In [3]:
# Stack Overflow questions and their tags; the first CSV column becomes the index.
DATA_URL = "https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/stackoverflow.csv"
df = pd.read_csv(DATA_URL, index_col=0)
df.head()

Unnamed: 0,Text,Tags
2,aspnet site maps has anyone got experience cre...,"['sql', 'asp.net']"
4,adding scripting functionality to net applicat...,"['c#', '.net']"
5,should i use nested classes in this case i am ...,['c++']
6,homegrown consumption of web services i have b...,['.net']
8,automatically update version number i would li...,['c#']


In [4]:
import ast


def _parse_tags(raw):
    """Return the tag list for one value of the ``Tags`` column.

    The CSV stores every tag list as its Python repr (e.g. "['sql', 'asp.net']"),
    so string values are parsed with ``ast.literal_eval``. Values that are not
    strings (i.e. already parsed) are returned unchanged, which keeps this cell
    safe to re-run: calling ``literal_eval`` on a list raises ``ValueError``.
    """
    return ast.literal_eval(raw) if isinstance(raw, str) else raw


# Convert the whole column from "list written as a string" to real Python lists.
df["Tags"] = df["Tags"].apply(_parse_tags)
df.head()

Unnamed: 0,Text,Tags
2,aspnet site maps has anyone got experience cre...,"[sql, asp.net]"
4,adding scripting functionality to net applicat...,"[c#, .net]"
5,should i use nested classes in this case i am ...,[c++]
6,homegrown consumption of web services i have b...,[.net]
8,automatically update version number i would li...,[c#]


In [5]:
# Encoder that turns each question's list of tags into a row of 0/1 indicators (one column per tag).
multilabel = MultiLabelBinarizer()

In [6]:
# Learn the set of tags and encode every question's tag list as a binary indicator row.
y = multilabel.fit_transform(df["Tags"])
# Display the indicator matrix with the learned tag names as column headers;
# this 0/1 matrix is the target format the multi-label classifiers below are trained on.
pd.DataFrame(y,columns=multilabel.classes_)

Unnamed: 0,.net,android,asp.net,c,c#,c++,css,html,ios,iphone,java,javascript,jquery,mysql,objective-c,php,python,ruby,ruby-on-rails,sql
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48971,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48972,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48973,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
48974,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [7]:
# Number of distinct tags the binarizer learned (i.e. the number of columns in y).
len(multilabel.classes_)

20

In [8]:
# The learned tag names, in the same order as the columns of y.
multilabel.classes_

array(['.net', 'android', 'asp.net', 'c', 'c#', 'c++', 'css', 'html',
       'ios', 'iphone', 'java', 'javascript', 'jquery', 'mysql',
       'objective-c', 'php', 'python', 'ruby', 'ruby-on-rails', 'sql'],
      dtype=object)

In [None]:
# TfidfVectorizer -> term frequency multiplied by inverse document frequency.
# analyzer="word" tokenises word by word (analyzer="char" would build character n-grams instead);
# ngram_range=(1, 1) keeps single words only, English stop words are dropped and the
# vocabulary is capped at 5000 terms.
tfidf = TfidfVectorizer(analyzer="word", max_features=5000, ngram_range=(1, 1), stop_words="english")
# Keep the result as the scipy sparse matrix that fit_transform returns: a dense copy
# (48976 rows x 5000 float64 columns) needs roughly 2 GB, and train_test_split and the
# linear models used below accept sparse input directly.
x = tfidf.fit_transform(df["Text"])
# Preview only the first rows; the full matrix is far too large to be worth rendering.
pd.DataFrame.sparse.from_spmatrix(x[:5])

In [None]:
tfidf.vocabulary_ # the learned vocabulary: maps every kept token to its column index in the TF-IDF matrix
# tfidf.get_stop_words() lists the stop words in use; tfidf.stop_words_ is different - it holds the
# terms that were discarded by the max_features / document-frequency limits, not the stop-word list

In [103]:
x.shape, y.shape # (rows, features) for x and (rows, tags) for y - the two row counts must match

((48976, 5000), (48976, 20))

In [89]:
# x holds the features and y the targets. 20% of the questions are held out for
# evaluation and the fixed random_state makes the split repeatable.
# No stratify argument is passed, so tag proportions are not forced to match
# between the training and the test part.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

## Build Model

In [99]:
# Candidate base estimators; each one is wrapped in OneVsRestClassifier before fitting.
# random_state is pinned (same seed as the train/test split) so that a re-run reproduces
# the same fit: SGD shuffles the training data and LinearSVC's solver draws random numbers.
sgd = SGDClassifier(random_state=0)  # linear model trained with stochastic gradient descent
lr = LogisticRegression(solver="lbfgs")  # lbfgs is the optimisation algorithm used to fit the weights
svc = LinearSVC(random_state=0)  # linear support vector classifier

In [100]:
def j_score(y_true, y_pred, empty_score=1.0):
    """Mean per-sample Jaccard similarity of two binary indicator matrices, in percent.

    For every row the score is ``|true AND predicted| / |true OR predicted|``,
    computed as the sum of the element-wise minimum over the sum of the
    element-wise maximum. The row scores are averaged and multiplied by 100.

    Args:
        y_true: array-like of shape (n_samples, n_labels) holding 0/1 ground-truth labels.
        y_pred: array-like of the same shape holding 0/1 predicted labels.
        empty_score: score given to a row where neither matrix has any label set.
            The ratio is 0/0 there, which used to produce NaN and turn the whole
            mean into NaN. The default of 1.0 treats two empty label sets as a
            perfect match; pass 0.0 to count them as a miss instead.

    Returns:
        The mean Jaccard score as a float between 0 and 100.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    intersection = np.minimum(y_true, y_pred).sum(axis=1)
    union = np.maximum(y_true, y_pred).sum(axis=1)
    # Pre-fill with the fallback and divide only where the union is non-empty,
    # so rows without any label never hit a 0/0 division.
    jaccard = np.full(union.shape, empty_score, dtype=float)
    np.divide(intersection, union, out=jaccard, where=union != 0)
    return jaccard.mean() * 100

def print_score(y_pred, clf, y_true=None):
    """Print an estimator's class name followed by its Jaccard score.

    Args:
        y_pred: predicted 0/1 indicator matrix to evaluate.
        clf: the estimator that produced ``y_pred``; only its class name is printed.
        y_true: ground-truth indicator matrix to score against. When omitted, the
            notebook-level ``y_test`` is used, so existing two-argument calls
            behave as before.
    """
    if y_true is None:
        y_true = y_test  # hold-out labels produced by the train/test split cell
    print("clf: ",clf.__class__.__name__)
    print("Jaccard Score: {}".format(j_score(y_true,y_pred)))
    print("-----------------------------------------")

In [101]:
# Train and evaluate every estimator in the list (currently only the linear SVC).
for classifier in [svc]:
    # One-vs-rest: each of the 20 tags in turn is treated as the positive class
    # against all the others, so one binary model is fitted per tag.
    clf = OneVsRestClassifier(classifier).fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("y_pred:{}".format(y_pred))
    print_score(y_pred, classifier)

y_pred:[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
clf:  LinearSVC
Jaccard Score: 61.109126173948546
-----------------------------------------


## Compare model performance (SGD, logistic regression, linear SVC) - the cell below is currently commented out

In [86]:
# for classifier in [sgd,lr,svc]:
#     clf = OneVsRestClassifier(classifier) #out of the 20 classes it will select 1 at a time and the other 19 as all other classes and will do the same for every data 
#     clf.fit(x_train,y_train)
#     y_pred = clf.predict(x_test)
#     print("y_pred:{}".format(y_pred))
#     print_score(y_pred,classifier)

## Test the model on a new, hand-written question

In [93]:
x = ["how to write ml code in python and java i have data but dont know what to do with it"]

In [94]:
xt = tfidf.transform(x)
xt
# pd.DataFrame(xt)#for viewing purposes

<1x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 17 stored elements in Compressed Sparse Row format>

In [95]:
# Predict the 0/1 tag indicators for the sample and show them with the tag names as column headers.
arr = clf.predict(xt)
pd.DataFrame(arr,columns=multilabel.classes_)

Unnamed: 0,.net,android,asp.net,c,c#,c++,css,html,ios,iphone,java,javascript,jquery,mysql,objective-c,php,python,ruby,ruby-on-rails,sql
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0


In [96]:
# Map the predicted indicator row back to the tag names it stands for.
multilabel.inverse_transform(clf.predict(xt))

[('java', 'python')]