<a href="https://colab.research.google.com/github/deeksha200/SAARTHI.AI-ASSIGNMENT/blob/main/saarthi_ai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**IMPORTING ALL NECESSARY LIBRARIES**

In [44]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize

**READING THE DATA**

In [45]:
train=pd.read_csv("/content/train_data.csv")
train.head()

Unnamed: 0,path,transcription,action,object,location
0,wavs/speakers/xRQE5VD7rRHVdyvM/7372ca00-45c4-1...,Turn on the kitchen lights,activate,lights,kitchen
1,wavs/speakers/R3mexpM2YAtdPbL7/dae28110-44fe-1...,Turn up the temperature,increase,heat,none
2,wavs/speakers/ZebMRl5Z7dhrPKRD/b55dcfd0-455d-1...,OK now switch the main language to Chinese,change language,Chinese,none
3,wavs/speakers/ppzZqYxGkESMdA5Az/61c54a20-4476-...,Turn down the bathroom temperature,decrease,heat,washroom
4,wavs/speakers/zaEBPeMY4NUbDnZy/8ef57ec0-44df-1...,Change the language,change language,none,none


In [46]:
valid=pd.read_csv("/content/valid_data.csv")
valid.head()

Unnamed: 0,path,transcription,action,object,location
0,wavs/speakers/7NqqnAOPVVSKnxyv/8b863c90-4627-1...,Turn on the lights,activate,lights,none
1,wavs/speakers/7NqqnAOPVVSKnxyv/8d8f7a10-4627-1...,Turn off the lights,deactivate,lights,none
2,wavs/speakers/7NqqnAOPVVSKnxyv/8f7773f0-4627-1...,Change language,change language,none,none
3,wavs/speakers/7NqqnAOPVVSKnxyv/916cb440-4627-1...,Pause the music,deactivate,music,none
4,wavs/speakers/7NqqnAOPVVSKnxyv/934321f0-4627-1...,Resume,activate,music,none


In [47]:
train['path'][0]

'wavs/speakers/xRQE5VD7rRHVdyvM/7372ca00-45c4-11e9-8ec0-7bf21d1cfe30.wav'

In [48]:
train['path'][1]

'wavs/speakers/R3mexpM2YAtdPbL7/dae28110-44fe-11e9-a1ea-79ca03012c0e.wav'

In [49]:
train['path']

0        wavs/speakers/xRQE5VD7rRHVdyvM/7372ca00-45c4-1...
1        wavs/speakers/R3mexpM2YAtdPbL7/dae28110-44fe-1...
2        wavs/speakers/ZebMRl5Z7dhrPKRD/b55dcfd0-455d-1...
3        wavs/speakers/ppzZqYxGkESMdA5Az/61c54a20-4476-...
4        wavs/speakers/zaEBPeMY4NUbDnZy/8ef57ec0-44df-1...
                               ...                        
11561    wavs/speakers/BvyakyrDmQfWEABb/a59b2910-4609-1...
11562    wavs/speakers/W4XOzzNEbrtZz4dW/aa186590-44d1-1...
11563    wavs/speakers/xwpvGaaWl5c3G5N3/6e189850-45ba-1...
11564    wavs/speakers/zaEBPeMY4NUbDnZy/3fd3a020-44d4-1...
11565    wavs/speakers/zaEBPeMY4NUbDnZy/3c3d7250-44da-1...
Name: path, Length: 11566, dtype: object

**CHECKING FOR NULL VALUES**

In [50]:
train.isnull().any()

path             False
transcription    False
action           False
object           False
location         False
dtype: bool

In [51]:
train['action'].unique()

array(['activate', 'increase', 'change language', 'decrease',
       'deactivate', 'bring'], dtype=object)

In [52]:
action_list=['activate','deactivate','increase','decrease','change language','bring']
action_list

['activate', 'deactivate', 'increase', 'decrease', 'change language', 'bring']

**UNIQUE VALUES IN THE DATASET**

In [53]:
train['object'].nunique()

14

In [54]:
train['object'].unique()

array(['lights', 'heat', 'Chinese', 'none', 'volume', 'English', 'lamp',
       'shoes', 'newspaper', 'socks', 'music', 'Korean', 'juice',
       'German'], dtype=object)

In [55]:
object_list=['lights', 'heat', 'Chinese', 'none', 'volume', 'English', 'lamp',
       'shoes', 'newspaper', 'socks', 'music', 'Korean', 'juice',
       'German']
object_list

['lights',
 'heat',
 'Chinese',
 'none',
 'volume',
 'English',
 'lamp',
 'shoes',
 'newspaper',
 'socks',
 'music',
 'Korean',
 'juice',
 'German']

In [56]:
train['location'].unique()

array(['kitchen', 'none', 'washroom', 'bedroom'], dtype=object)

In [57]:
location_list=['kitchen', 'none', 'washroom', 'bedroom']
location_list

['kitchen', 'none', 'washroom', 'bedroom']

**DATA CLEANING**

In [59]:
import re
import string
from nltk.corpus import stopwords

def clean_text(text):
    """Process text function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet
    """
    lemmatizer = WordNetLemmatizer()
    stopwords_english = stopwords.words('english')
    text= re.sub('\[[^]]*\]', '', text)
    # remove stock market tickers like $GE
    text = re.sub(r'\$\w*', '', text)
    #removal of html tags
    review =re.sub(r'<.*?>',' ',text) 
    # remove old style retweet text "RT"
    text = re.sub(r'^RT[\s]+', '', text)
    # remove hyperlinks
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    # remove hashtags
    # only removing the hash # sign from the word
    text = re.sub(r'#', '', text)
    text = re.sub("["
                           u"\U0001F600-\U0001F64F"  # removal of emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+",' ',text)
    text = re.sub('[^a-zA-Z]',' ',text) 
    text = text.lower()
    text_tokens =word_tokenize(text)

    text_clean = []
    for word in  text_tokens:
        if (                                       
            word not in string.punctuation):  # remove punctuation
            lem_word =lemmatizer.lemmatize(word)  # lemmitiging word
            text_clean.append(lem_word)
    text_mod=[i for i in text_clean if len(i)>1]
    text_clean=' '.join(text_mod)
    return  text_clean

In [60]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [61]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [62]:
train['clean_text']=train['transcription'].apply(lambda x: clean_text(x))

In [63]:
train['clean_text']

0                         turn on the kitchen light
1                           turn up the temperature
2        ok now switch the main language to chinese
3                turn down the bathroom temperature
4                               change the language
                            ...                    
11561                               kitchen heat up
11562                       turn the temperature up
11563                                    bring shoe
11564                                   volume mute
11565                    turn off the kitchen light
Name: clean_text, Length: 11566, dtype: object

In [64]:
valid['clean_text']=valid['transcription'].apply(lambda x:clean_text(x))
valid['clean_text']

0          turn on the light
1         turn off the light
2            change language
3            pause the music
4                     resume
                ...         
3113                light on
3114    switch off the light
3115      turn the light off
3116               light off
3117               volume up
Name: clean_text, Length: 3118, dtype: object

In [65]:
train['clean_text'][7]

'it too loud turn it down'

In [66]:
valid.shape

(3118, 6)

In [67]:
train.shape

(11566, 6)

In [68]:
train.drop(['transcription','path'],axis=1,inplace=True)
train.shape

(11566, 4)

In [69]:
valid.drop(['transcription','path'],axis=1,inplace=True)
valid.shape

(3118, 4)

In [70]:
train.head()

Unnamed: 0,action,object,location,clean_text
0,activate,lights,kitchen,turn on the kitchen light
1,increase,heat,none,turn up the temperature
2,change language,Chinese,none,ok now switch the main language to chinese
3,decrease,heat,washroom,turn down the bathroom temperature
4,change language,none,none,change the language


In [71]:
train['labels']=train['action']+' '+train['object']+' '+train['location']
train.head()

Unnamed: 0,action,object,location,clean_text,labels
0,activate,lights,kitchen,turn on the kitchen light,activate lights kitchen
1,increase,heat,none,turn up the temperature,increase heat none
2,change language,Chinese,none,ok now switch the main language to chinese,change language Chinese none
3,decrease,heat,washroom,turn down the bathroom temperature,decrease heat washroom
4,change language,none,none,change the language,change language none none


In [72]:
valid.head()

Unnamed: 0,action,object,location,clean_text
0,activate,lights,none,turn on the light
1,deactivate,lights,none,turn off the light
2,change language,none,none,change language
3,deactivate,music,none,pause the music
4,activate,music,none,resume


In [73]:
valid['labels']=valid['action']+' '+valid['object']+' '+valid['location']
valid.head()

Unnamed: 0,action,object,location,clean_text,labels
0,activate,lights,none,turn on the light,activate lights none
1,deactivate,lights,none,turn off the light,deactivate lights none
2,change language,none,none,change language,change language none none
3,deactivate,music,none,pause the music,deactivate music none
4,activate,music,none,resume,activate music none


In [74]:
train.drop(['action','object','location'],axis=1,inplace=True)
train.head()

Unnamed: 0,clean_text,labels
0,turn on the kitchen light,activate lights kitchen
1,turn up the temperature,increase heat none
2,ok now switch the main language to chinese,change language Chinese none
3,turn down the bathroom temperature,decrease heat washroom
4,change the language,change language none none


In [75]:
valid.drop(['action','object','location'],axis=1,inplace=True)
valid.head()

Unnamed: 0,clean_text,labels
0,turn on the light,activate lights none
1,turn off the light,deactivate lights none
2,change language,change language none none
3,pause the music,deactivate music none
4,resume,activate music none


**TF-IDF VECTORIZATION**

In [83]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

vectorizer = TfidfVectorizer()
vectorised_train_documents = vectorizer.fit_transform(train["clean_text"])
vectorised_test_documents = vectorizer.transform(valid["clean_text"])

In [84]:
train_categories=train['labels']
train_categories.head()

0         activate lights kitchen
1              increase heat none
2    change language Chinese none
3          decrease heat washroom
4       change language none none
Name: labels, dtype: object

In [87]:
test_categories=valid['labels']
test_categories.head()

0         activate lights none
1       deactivate lights none
2    change language none none
3        deactivate music none
4          activate music none
Name: labels, dtype: object

**VECTORIZE OUTPUT LABELS**

In [88]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform(train_categories)
test_labels = mlb.transform(test_categories)

In [89]:
train_labels

array([[1, 0, 0, ..., 0, 1, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 1, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 1, 0],
       [1, 0, 0, ..., 0, 1, 0]])

In [90]:
test_labels

array([[1, 0, 0, ..., 0, 1, 0],
       [1, 0, 0, ..., 0, 1, 0],
       [1, 0, 0, ..., 1, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 1, 0],
       [1, 0, 0, ..., 0, 1, 0],
       [1, 0, 0, ..., 1, 1, 0]])

**KNN MODEL**

In [102]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier

knnClf = KNeighborsClassifier()

knnClf.fit(vectorised_train_documents, train_labels)
knnPredictions = knnClf.predict(vectorised_test_documents)
metricsReport("knn", test_labels, knnPredictions)

**RANDOM FOREST CLASSIFIER**

In [103]:
from sklearn.ensemble import RandomForestClassifier
rfClassifier = RandomForestClassifier(n_jobs=-1)
rfClassifier.fit(vectorised_train_documents, train_labels)
rfPreds = rfClassifier.predict(vectorised_test_documents)
metricsReport("Random Forest", test_labels, rfPreds)

**BAGGING CLASSIFIER**

In [104]:
from sklearn.ensemble import BaggingClassifier

bagClassifier = OneVsRestClassifier(BaggingClassifier(n_jobs=-1))
bagClassifier.fit(vectorised_train_documents, train_labels)
bagPreds = bagClassifier.predict(vectorised_test_documents)
metricsReport("Bagging", test_labels, bagPreds)

  str(classes[c]))
  str(classes[c]))


**GRADIENT BOOSTING CLASSIFIER**

In [106]:
from sklearn.ensemble import GradientBoostingClassifier

boostClassifier = OneVsRestClassifier(GradientBoostingClassifier())
boostClassifier.fit(vectorised_train_documents, train_labels)
boostPreds = boostClassifier.predict(vectorised_test_documents)
metricsReport("Boosting", test_labels, boostPreds)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
