# NDSC Beginner (Text Classification)

### File preparation

In [None]:
import pandas as pd
import math as math
import sys

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Read the training data.
df = pd.read_csv("../input/train.csv")

# Shuffle the rows: frac=1 keeps every record, only the order changes.
df = df.sample(frac=1)

# Work on an independent copy. Adding a column to the `df[:]` slice makes
# pandas emit SettingWithCopyWarning because the slice may share data with df.
df_train = df.copy()

# Derive the top-level category from the image path: take the first path
# component, keep the text before its first underscore, and capitalize it.
output = [
    path.split('/')[0].split('_')[0].capitalize()
    for path in df_train['image_path']
]
df_train['Category_Type'] = output

# Quick look at the data. head() only renders when it is the last expression
# of a notebook cell; info() prints its summary directly.
df_train.head()
df_train.info()

sys.stdout.write('File preparation done...')

### Convert JSON file into dictionaries

In [None]:
import json
import pprint

# Load the nested category definitions.
# NOTE(review): the CSV files are read from "../input/" while this path is
# relative to "input/" - confirm which location is intended.
with open('input/categories.json') as f:
    categories_json = json.load(f)

# Flatten the nested mapping into two lookup tables, one per direction.
id_to_category = {}
category_to_id = {}

for name_to_id in categories_json.values():
    for category_name, category_id in name_to_id.items():
        id_to_category[category_id] = category_name
        category_to_id[category_name] = category_id

pprint.pprint(id_to_category)
sys.stdout.write('Conversion of JSON done...')

### Data Cleansing

In [None]:
import nltk
import re

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Remove standalone single letters that sit between whitespace.
def keepAlpha(sentence):
    """Drop isolated single ASCII letters surrounded by whitespace.

    A run of one or more whitespace-separated single letters, together with
    the whitespace around it, is collapsed into one space. Matching the whole
    run at once matters: handling one letter at a time consumes the separator
    and lets every second letter of a run such as ``"x a b y"`` survive.

    A single letter at the very start or end of the string is left untouched,
    because it is not enclosed by whitespace on both sides.

    Args:
        sentence: The text to clean.

    Returns:
        The text with enclosed single letters removed.
    """
    return re.sub(r'(?:\s+[a-zA-Z])+\s+', ' ', sentence)

# Mask digits: each digit becomes the placeholder 'X' (nothing is deleted).
def removeDigit(sentence):
    """Return *sentence* with every digit character replaced by ``'X'``."""
    return re.sub(r'\d', 'X', sentence)

# Cats -> Cat
def lemmatization(sentence, lemmatizer=None):
    """Lemmatize every whitespace-separated word of *sentence*.

    The words (not the individual characters) are lemmatized and joined back
    with single spaces, so word boundaries are preserved.

    Args:
        sentence: The text to process; it is converted with ``str()`` first.
        lemmatizer: Optional object exposing ``lemmatize(word)``. When omitted
            a new ``WordNetLemmatizer`` is created; pass one in to reuse the
            same instance across many calls.

    Returns:
        The lemmatized words joined by single spaces.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    words = str(sentence).split()
    return ' '.join(lemmatizer.lemmatize(word) for word in words)

# amusing, amusement, and amused, the stem would be amuse
def stemming(sentence, stemmer=None):
    """Stem every whitespace-separated word of *sentence*.

    Args:
        sentence: The text to process.
        stemmer: Optional object exposing ``stem(word)``. When omitted a new
            English ``SnowballStemmer`` is created; pass one in to reuse the
            same instance across many calls.

    Returns:
        The stemmed words joined by single spaces, with surrounding
        whitespace stripped.
    """
    if stemmer is None:
        stemmer = SnowballStemmer("english")

    # join() builds the result in one pass instead of repeated string +=.
    return " ".join(stemmer.stem(word) for word in sentence.split()).strip()

# Normalize the titles: lower-case them, drop enclosed single letters, then
# mask digits. The steps run in this order on the same column.
df_train['title'] = (
    df_train['title']
    .str.lower()
    .apply(keepAlpha)
    .apply(removeDigit)
)
# Optional extra steps, left disabled:
# df_train['title'] = df_train['title'].apply(stemming) # increases or decreases accuracy depending on data
# df_train['title'] = df_train['title'].apply(lemmatization) 

print(df_train['title'])
sys.stdout.write('Data cleansing done...')

### Data Exploration

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the number of titles per top-level category.
fig = plt.figure(figsize=(14,8))
titles_per_category = df_train.groupby('Category_Type')['title'].count()
ax = titles_per_category.plot.bar(title="Category Distribution")
ax.set_ylabel("Frequency")
plt.show()

### Feature Extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer settings:
# - sublinear_tf=True uses a logarithmic form for term frequency.
# - min_df is the minimum number of documents a term must appear in to be kept.
# - norm='l2' gives every feature vector a Euclidean norm of 1.
# - ngram_range=(1, 2) considers both unigrams and bigrams.
# - stop_words='english' removes English stop words.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)
# NOTE(review): toarray() turns the sparse matrix into a dense one, which can
# need a lot of memory on large inputs - confirm it is required downstream.
features = tfidf.fit_transform(df_train['title']).toarray()
labels = df_train['Category']

features.shape
sys.stdout.write('Feature Extraction done...')

In [None]:
from sklearn.feature_selection import chi2
import numpy as np

N = 2

# The vocabulary does not change between categories, so resolve it once.
# get_feature_names() is absent from recent scikit-learn releases, where
# get_feature_names_out() replaces it; support whichever is available.
if hasattr(tfidf, 'get_feature_names_out'):
    all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    all_feature_names = np.array(tfidf.get_feature_names())

for Product, category_id in sorted(category_to_id.items()):

    # One-vs-rest chi-squared score of every term for this category.
    features_chi2 = chi2(features, labels == category_id)
    # Ascending sort: the most correlated terms end up last.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    #print("# '{}':".format(Product))
    #print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    #print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

sys.stdout.write('Chi2 extraction done...')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Hold out part of the data for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    df_train['title'], df_train['Category'], random_state=0
)

# Word counts -> TF-IDF weights, both fitted on the training split only.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

sys.stdout.write('TFIDF done...')

### Naive Bayes

In [None]:
# from sklearn.naive_bayes import MultinomialNB
# clf = MultinomialNB().fit(X_train_tfidf, y_train)

# print(clf.predict(count_vect.transform(["new jaminan mut cream baby pink original pemutih kulit wajah obat jerawat perawatan"])))

### Polynomial Kernel

In [None]:
# SVM with a polynomial kernel
from sklearn.svm import SVC
from sklearn.datasets import samples_generator
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Scale the features, then fit an SVM with a degree-9 polynomial kernel.
# with_mean=False scales without centering, which StandardScaler requires
# for sparse input.
clf = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),
    ("svm_clf", SVC(kernel="poly", degree=9, coef0=1, C=5))
])

clf.fit(X_train_tfidf, y_train)
sys.stdout.write('Polynomial Kernal done...')

# Predict the held-out split; the confusion-matrix cell reads y_pred.
# The test titles go through the same count -> TF-IDF transformation the
# classifier was trained on, not through raw counts alone.
X_test_tfidf = tfidf_transformer.transform(count_vect.transform(X_test))
y_pred = clf.predict(X_test_tfidf)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns;

def plot_graph():
    """Draw the confusion matrix of ``y_test`` against ``y_pred`` as a heatmap.

    Reads the module-level ``y_test``, ``y_pred`` and ``id_to_category``.
    The matrix rows/columns are pinned to the sorted category ids and the
    tick labels are built from that same order, so each label names the
    row/column it is drawn next to.
    """
    category_ids = sorted(id_to_category)
    category_names = [id_to_category[category_id] for category_id in category_ids]

    conf_mat = confusion_matrix(y_test, y_pred, labels=category_ids)
    fig, ax = plt.subplots(figsize=(25,25))
    sns.heatmap(conf_mat, annot=True, fmt='d',
                xticklabels=category_names, yticklabels=category_names, ax=ax)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()
    
# Render the confusion matrix, then report that the cell has finished.
plot_graph()
sys.stdout.write('Plotting of Graph done...')

### Trying out different models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

def diffModel():
    """Cross-validate several classifiers and plot their per-fold accuracy.

    Each model is scored with 5-fold cross-validation on the module-level
    ``features`` and ``labels``; the fold accuracies are shown as a box plot
    with the individual points overlaid.
    """
    # Imported locally: the notebook's top-level MultinomialNB import is
    # commented out, so the name would otherwise be undefined here.
    from sklearn.naive_bayes import MultinomialNB
    import seaborn as sns

    models = [
        RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
        LinearSVC(),
        MultinomialNB(),
        LogisticRegression(random_state=0),
    ]
    CV = 5

    # One (model, fold, accuracy) row per cross-validation fold.
    entries = []
    for model in models:
        model_name = model.__class__.__name__
        accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
        entries.extend(
            (model_name, fold_idx, accuracy)
            for fold_idx, accuracy in enumerate(accuracies)
        )
    cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

    sns.boxplot(x='model_name', y='accuracy', data=cv_df)
    sns.stripplot(x='model_name', y='accuracy', data=cv_df,
                  size=8, jitter=True, edgecolor="gray", linewidth=2)
    plt.show()
    
# diffModel()

### Use the learning model and export to csv file

In [None]:
from tqdm import tqdm  # progress bar

from sklearn.metrics import precision_score, recall_score

def checker (df_train):
    """Count how many rows of *df_train* the classifier labels correctly.

    The titles are converted with the fitted ``count_vect`` followed by the
    fitted ``tfidf_transformer`` - the same representation ``clf`` was
    trained on - and predicted in a single batch.

    Args:
        df_train: DataFrame with a 'title' column (text) and a 'Category'
            column (expected label).

    Returns:
        The number of rows whose predicted category equals 'Category'.
    """
    if len(df_train) == 0:
        # Nothing to predict; predicting on zero samples would raise.
        return 0

    title_features = tfidf_transformer.transform(
        count_vect.transform(df_train['title'])
    )
    predictions = clf.predict(title_features)
    return int((predictions == df_train['Category'].values).sum())
    
#print (precision_score(count_vect.transform(df_train['Category'], count_vect.transform(X_test).astype(np.float64)))
#print("{} / {} is correct".format(checker (df_train), len(df_train)))

In [None]:
import pandas as pd
from collections import OrderedDict
from tqdm import tqdm  # progress bar

# Load the test set; the trailing [:] takes a full slice of the frame.
df_test = pd.read_csv("../input/test.csv")[:]

def checkTest (df_test):
    """Predict a category for every row of *df_test*.

    The titles receive the same cleansing as the training titles
    (lower-casing, ``keepAlpha``, ``removeDigit``) and the same
    count -> TF-IDF transformation ``clf`` was trained on, then are predicted
    in a single batch. The input frame is not modified.

    Args:
        df_test: DataFrame with 'itemid' and 'title' columns.

    Returns:
        A dict whose first entry is the header pair ``'itemid': 'Category'``,
        followed by one ``itemid -> predicted category`` entry per row.
    """
    # The header row travels in the dict so the writer can dump it as-is.
    result = {'itemid': 'Category'}
    if len(df_test) == 0:
        # Nothing to predict; predicting on zero samples would raise.
        return result

    titles = df_test['title'].str.lower().apply(keepAlpha).apply(removeDigit)
    title_features = tfidf_transformer.transform(count_vect.transform(titles))
    predictions = clf.predict(title_features)

    result.update(zip(df_test['itemid'], predictions))
    return result

result = checkTest (df_test)

# Write one "key,value" line per dict entry, in insertion order.
with open('result.csv', 'w') as f:
    f.writelines('{},{}\n'.format(key, value) for key, value in result.items())
sys.stdout.write('Writing to kernel done...')