### Load libraries

In [1]:
import numpy as np 
import string

import nltk
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

### Pre_Process sentences and taking only Noun for training

In [2]:
def pre_process_words(word):
    lemmatizer = WordNetLemmatizer()
    porter = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    x=nltk.pos_tag([word])
    if x[0][1] not in ["NN","NNS","NNP"]:
        return None
    if word in stop_words:
        return None
    word=lemmatizer.lemmatize(word)
    return re.findall(r'(?:[a-zA-Z]+[a-zA-Z\'\-]?[a-zA-Z]|[a-zA-Z] +)',word)

In [3]:
def pre_process(txt):
    txt=txt.lower()
    tokens = nltk.word_tokenize(txt)
    new_txt=""
    for i in tokens:
        temp=pre_process_words(i)
        if(temp):
            new_txt=new_txt+" "+temp[0]     
        
    return new_txt

### Generating Dataset

In [4]:
def generate_dataset(path):
    x_train=[]
    y_train=[]
    c=0
    with open(path, 'r') as file:
        for line in file:
            c+=1
            if c==1:
                continue
            x=[a for a in line.rstrip().split("\t")]
            sen=pre_process(x[0])
            x_train.append(sen)
            y_train.append(x[1])
            
    return x_train,y_train

In [5]:
x_train,y_train=generate_dataset("dataset/training.txt")
x=np.array(x_train)
y=np.array(y_train)


### Creating Pipeline for Classification Of Data and training Data

In [6]:
text_clf=Pipeline([('vect',CountVectorizer()),   #convert text to vectors
                   ('tfidf',TfidfTransformer()), #normalizing Data
                   ('clf', LinearSVC())])        #using Support Vector Machine With Linear Kernel 
text_clf.fit(x,y)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

### Testing Model

In [7]:
test=[]
for i in range(int(input())): 
    x=input()
    sen=" ".join(word for word in pre_process_words(x))
    test.append(x)
predicted=text_clf.predict(np.array(test))
for i in predicted:
    print(i)

1
Canon EOS 1100D SLR (Black, with Kit (EF S18-55 III))
dslr canon
