## Objective

The objective of this notebook is to generate a simple SVM classifier. The data used is a from a whatsapp group called GOATS (Go out and tour somewhere). We classify the data into four kinds of categories:
    
    1 New to community

    2 Tour plan timings

    3 Where do i buy this gear?

    4 Other doubts

## Data

In [3]:
import csv
import pandas as pd
trgData = pd.DataFrame(columns=['question','category'])

with open('cyclingGroupChat.csv','r') as f:
    reader=csv.reader(f)
    for line1,line2 in reader:
        temp2 = pd.DataFrame([[line1,line2]],columns=['question','category'])
        trgData = trgData.append(temp2,ignore_index=True)

In [4]:
trgData.head()

Unnamed: 0,question,category
0,You were added,1
1,Bike/ car ride to Mulliyangiri and camping at ...,2
2,Chikmangalur for lunch post lunch head to Hebb...,2
3,It's motors bike trip and trek,2
4,I am d organiser any doubts I will clarify,4


In [5]:
trgData['category'].unique()

array(['1', '2', '4', '3'], dtype=object)

In [6]:
import cPickle
stopWords = cPickle.load(open('stopWords.p', 'rb'))

In [7]:
keywordsList = []
for line in trgData['question']:
    temp = line.split()
    tempList = []
    toClean = []
    for word in temp:
        tempWord = [e.lower() for e in word if e.isalnum()]
        toClean.append(''.join(tempWord))
    keywordsListTemp = [i for i in toClean if i not in stopWords]
    keywordsList.append(' '.join(keywordsListTemp))

In [8]:
trgData2 = pd.DataFrame({'question': keywordsList, 'category': list(trgData['category'])})

In [9]:
trgData2

Unnamed: 0,category,question
0,1,
1,2,bike car ride mulliyangiri camping chikmangalu...
2,2,chikmangalur lunch post lunch head hebbe falls...
3,2,motors bike trip trek
4,4,organiser doubts clarify
5,4,chikamagalur 240 kms
6,2,reach lunch day
7,2,anyways details
8,1,friend group message interested
9,3,bike choose budget


In [10]:
toDrop = []
for i in xrange(len(trgData2)):
    if trgData2['category'][i] == '0':
        toDrop.append(i)

In [11]:
trgData2 = trgData2.drop(trgData2.index[ toDrop ])

In [12]:
trgData2.head(15)

Unnamed: 0,category,question
0,1,
1,2,bike car ride mulliyangiri camping chikmangalu...
2,2,chikmangalur lunch post lunch head hebbe falls...
3,2,motors bike trip trek
4,4,organiser doubts clarify
5,4,chikamagalur 240 kms
6,2,reach lunch day
7,2,anyways details
8,1,friend group message interested
9,3,bike choose budget


In [13]:
trgData = pd.DataFrame(columns=['question','category'])
for i in xrange(len(trgData2)):
    if len(trgData2.iloc[i]['category'])>1 :
        tempList = trgData2.iloc[i]['category'].split(',')
        for j in tempList:
            temp2 = pd.DataFrame([[j,trgData2.iloc[i]['question']]],columns=['category','question'])
            trgData = trgData.append(temp2,ignore_index=True)
    else:
        trgData = trgData.append(trgData2.iloc[i])

In [14]:
trgData.head(15)

Unnamed: 0,question,category
0,,1
1,bike car ride mulliyangiri camping chikmangalu...,2
2,chikmangalur lunch post lunch head hebbe falls...,2
3,motors bike trip trek,2
4,organiser doubts clarify,4
5,chikamagalur 240 kms,4
6,reach lunch day,2
7,anyways details,2
8,friend group message interested,1
9,bike choose budget,3


In [15]:
trgData['category'].unique()

array(['1', '2', '4', '3'], dtype=object)

In [16]:
print len(trgData)

68


In [17]:
from sklearn.cross_validation import KFold
kf = KFold(244, n_folds=4)



In [18]:
X = list(trgData['question'])
Y = list(trgData['category'])

   ## Categories
                 1 New to community

                 2 Tour plan timings

                 3 Where do i buy this gear?

                 4 Other doubts

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform( X )
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, n_iter=5, random_state=42)),])
text_clf = text_clf.fit( X , Y )


In [20]:
predicted = text_clf.predict(["Hi, I am new to the group","Where will I get this bike which has 24 gears","Destination unknown meet around 12AM","I am a loser","What is this community about"])
print (predicted)

['1' '3' '2' '4' '1']


In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics

kf = KFold(len(trgData), n_folds=6)

for train_index, test_index in kf:
    trainSet_x = []
    testSet_x = []
    trainSet_y = []
    testSet_y = []
    for i in train_index:
        trainSet_x.append(X[i])
        trainSet_y.append(Y[i])
    for i in test_index:
        testSet_x.append(X[i])
        testSet_y.append(Y[i])
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform( trainSet_x )
    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, n_iter=5, random_state=42)),])
    text_clf = text_clf.fit( trainSet_x , trainSet_y )
    predicted = text_clf.predict(testSet_y)
    print(metrics.classification_report( testSet_y , predicted, target_names=['1','2','3','4']))
    

             precision    recall  f1-score   support

          1       0.00      0.00      0.00         2
          2       0.00      0.00      0.00         5
          3       0.17      1.00      0.29         2
          4       0.00      0.00      0.00         3

avg / total       0.03      0.17      0.05        12

             precision    recall  f1-score   support

          1       0.00      0.00      0.00         2
          2       0.08      1.00      0.15         1
          3       0.00      0.00      0.00         9

avg / total       0.01      0.08      0.01        12

             precision    recall  f1-score   support

          1       0.00      0.00      0.00         1
          2       0.00      0.00      0.00         4
          3       0.45      1.00      0.62         5
          4       0.00      0.00      0.00         1

avg / total       0.21      0.45      0.28        11

             precision    recall  f1-score   support

          1       0.00      0.00    

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [23]:
import pickle
pickle.dump(text_clf,open("classifier.p", "wb"))

In [27]:
classifier = pickle.load(open("classifier.p","rb"))
classifier.predict(["Hi, I am new to the group"])

## Conclusion

In [32]:
classifier = pickle.load(open("classifier.p","rb"))

def classify(sentence):
    return (classifier.predict([sentence]))  

classify("Hi, I am new to the group")

array(['1'], 
      dtype='|S1')