In [1]:
import nltk
import pandas as pd
import re
import string

## 1. Read your data

### 1a.

In [2]:
# Load the comments from a CSV file; the path is relative to the working directory.
dataset = pd.read_csv('Youtube05-Shakira.csv')

### 1b.

In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 370 entries, 0 to 369
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   COMMENT_ID  370 non-null    object
 1   AUTHOR      370 non-null    object
 2   DATE        370 non-null    object
 3   CONTENT     370 non-null    object
 4   CLASS       370 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 14.6+ KB


In [4]:
dataset.shape

(370, 5)

In [5]:
dataset.head(3)

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,z13lgffb5w3ddx1ul22qy1wxspy5cpkz504,dharma pal,2015-05-29T02:30:18.971000,Nice song﻿,0
1,z123dbgb0mqjfxbtz22ucjc5jvzcv3ykj,Tiza Arellano,2015-05-29T00:14:48.748000,I love song ﻿,0
2,z12quxxp2vutflkxv04cihggzt2azl34pms0k,Prìñçeśś Âliś Łøvê Dømíñø Mâđiś™ ﻿,2015-05-28T21:00:08.607000,I love song ﻿,0


### Using only one column 'CONTENT' for the dataset

In [6]:
# Keep only the 'CONTENT' column. From this point `dataset` is rebound to that
# single column instead of the full frame loaded from the CSV.
dataset = dataset['CONTENT']

### 1c.

In [7]:
dataset.isnull().sum()

0

In [8]:
dataset.head(5)

0                                           Nice song﻿
1                                        I love song ﻿
2                                        I love song ﻿
3    860,000,000 lets make it first female to reach...
4                        shakira is best for worldcup﻿
Name: CONTENT, dtype: object

---

## 2. Remove Punctuation

### 2a.

In [9]:
def remove_punctuatuion(comment):
    """Return ``comment`` with punctuation characters removed.

    Every character that is neither a regex word character nor whitespace is
    deleted. Underscores are deleted as well: the regex engine treats ``_`` as
    a word character, so a plain "not word, not space" class would leave them
    in the text even though they are punctuation.

    Args:
        comment: The text to clean (a ``str``).

    Returns:
        A new string containing only the remaining word characters and
        whitespace; whitespace is left exactly as it was.
    """
    # The second alternative catches the underscore that the first one misses.
    return re.sub(r'[^\w\s]|_', '', comment)

In [10]:
# Strip punctuation from every comment and label the result "punct_text".
no_punct = dataset.map(remove_punctuatuion).rename("punct_text")

In [11]:
no_punct.head()

0                                            Nice song
1                                         I love song 
2                                         I love song 
3    860000000 lets make it first female to reach o...
4                         shakira is best for worldcup
Name: punct_text, dtype: object

In [12]:
# Put the original comments and their punctuation-free versions side by side.
# assign() is used instead of pd.concat so that re-running this cell overwrites
# the "punct_text" column rather than appending a duplicate of it.
dataset = pd.DataFrame(dataset).assign(punct_text=no_punct)

### 2b.

In [13]:
dataset.head(10)

Unnamed: 0,CONTENT,punct_text
0,Nice song﻿,Nice song
1,I love song ﻿,I love song
2,I love song ﻿,I love song
3,"860,000,000 lets make it first female to reach...",860000000 lets make it first female to reach o...
4,shakira is best for worldcup﻿,shakira is best for worldcup
5,The best world cup song ever!!!!﻿,The best world cup song ever
6,I love﻿,I love
7,SEE SOME MORE SONG OPEN GOOGLE AND TYPE Shakir...,SEE SOME MORE SONG OPEN GOOGLE AND TYPE Shakir...
8,Awesome ﻿,Awesome
9,I like shakira..﻿,I like shakira


---

## 3. Tokenization

### 3a., 3c.

In [14]:
from nltk.tokenize import word_tokenize

In [16]:
# Lower-case each punctuation-free comment, then split it into word tokens.
# NOTE(review): no nltk.download(...) call for tokenizer data is visible in this
# notebook; confirm the data word_tokenize needs is already installed.
token_text = (
    dataset['punct_text']
    .map(lambda text: word_tokenize(text.lower()))
    .rename("token_text")
)

In [17]:
# Attach the token lists as a new column. assign() overwrites "token_text" if
# the cell is run again, whereas pd.concat would add a second column with the
# same name and break later dataset['token_text'] lookups.
dataset = dataset.assign(token_text=token_text)

### 3b.

In [18]:
dataset.head(10)

Unnamed: 0,CONTENT,punct_text,token_text
0,Nice song﻿,Nice song,"[nice, song]"
1,I love song ﻿,I love song,"[i, love, song]"
2,I love song ﻿,I love song,"[i, love, song]"
3,"860,000,000 lets make it first female to reach...",860000000 lets make it first female to reach o...,"[860000000, lets, make, it, first, female, to,..."
4,shakira is best for worldcup﻿,shakira is best for worldcup,"[shakira, is, best, for, worldcup]"
5,The best world cup song ever!!!!﻿,The best world cup song ever,"[the, best, world, cup, song, ever]"
6,I love﻿,I love,"[i, love]"
7,SEE SOME MORE SONG OPEN GOOGLE AND TYPE Shakir...,SEE SOME MORE SONG OPEN GOOGLE AND TYPE Shakir...,"[see, some, more, song, open, google, and, typ..."
8,Awesome ﻿,Awesome,[awesome]
9,I like shakira..﻿,I like shakira,"[i, like, shakira]"


---

## 4. Remove Stopwords

### 4a.

In [19]:
from nltk.corpus import stopwords

In [20]:
def remove_stopwords(list_of_tokens, stop_words=None):
    """Return the tokens from ``list_of_tokens`` that are not stop words.

    Args:
        list_of_tokens: Iterable of word tokens to filter.
        stop_words: Optional iterable of words to drop. When omitted, the
            English stop-word list from ``nltk.corpus.stopwords`` is used.

    Returns:
        A new list holding the surviving tokens in their original order.
    """
    if stop_words is None:
        # Fetch the list once per call; the comprehension below must not
        # re-evaluate it for every token.
        stop_words = stopwords.words('english')
    # A set gives constant-time membership tests instead of a linear scan.
    stop_set = frozenset(stop_words)
    return [word for word in list_of_tokens if word not in stop_set]

In [21]:
# Filter the stop words out of every token list.
nostop_text = dataset['token_text'].map(remove_stopwords).rename("nostop_text")

In [22]:
# Attach the filtered token lists as a new column. assign() overwrites
# "nostop_text" on a re-run instead of appending a duplicate column the way
# pd.concat does.
dataset = dataset.assign(nostop_text=nostop_text)

### 4b.

In [23]:
dataset.head()

Unnamed: 0,CONTENT,punct_text,token_text,nostop_text
0,Nice song﻿,Nice song,"[nice, song]","[nice, song]"
1,I love song ﻿,I love song,"[i, love, song]","[love, song]"
2,I love song ﻿,I love song,"[i, love, song]","[love, song]"
3,"860,000,000 lets make it first female to reach...",860000000 lets make it first female to reach o...,"[860000000, lets, make, it, first, female, to,...","[860000000, lets, make, first, female, reach, ..."
4,shakira is best for worldcup﻿,shakira is best for worldcup,"[shakira, is, best, for, worldcup]","[shakira, best, worldcup]"


---

## 5. Stem

### 5a.

In [26]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

In [29]:
def porter_stemmer(plurals, stemmer):
    """Stem every token in ``plurals`` with the given stemmer.

    Args:
        plurals: Iterable of word tokens.
        stemmer: Object exposing a ``stem(word)`` method.

    Returns:
        A list with one stemmed token per input token, in the same order.
    """
    stems = []
    for token in plurals:
        stems.append(stemmer.stem(token))
    return stems

In [30]:
# Stem every stop-word-free token list with the shared stemmer instance.
stemmed_text = (
    dataset['nostop_text']
    .map(lambda tokens: porter_stemmer(tokens, stemmer))
    .rename("stemmed_text")
)

In [31]:
# Attach the stemmed token lists as a new column. assign() overwrites
# "stemmed_text" on a re-run instead of appending a duplicate column the way
# pd.concat does.
dataset = dataset.assign(stemmed_text=stemmed_text)

In [34]:
dataset.head()

Unnamed: 0,CONTENT,punct_text,token_text,nostop_text,stemmed_text
0,Nice song﻿,Nice song,"[nice, song]","[nice, song]","[nice, song]"
1,I love song ﻿,I love song,"[i, love, song]","[love, song]","[love, song]"
2,I love song ﻿,I love song,"[i, love, song]","[love, song]","[love, song]"
3,"860,000,000 lets make it first female to reach...",860000000 lets make it first female to reach o...,"[860000000, lets, make, it, first, female, to,...","[860000000, lets, make, first, female, reach, ...","[860000000, let, make, first, femal, reach, on..."
4,shakira is best for worldcup﻿,shakira is best for worldcup,"[shakira, is, best, for, worldcup]","[shakira, best, worldcup]","[shakira, best, worldcup]"


In [36]:
dataset[['nostop_text', 'stemmed_text']].head()

Unnamed: 0,nostop_text,stemmed_text
0,"[nice, song]","[nice, song]"
1,"[love, song]","[love, song]"
2,"[love, song]","[love, song]"
3,"[860000000, lets, make, first, female, reach, ...","[860000000, let, make, first, femal, reach, on..."
4,"[shakira, best, worldcup]","[shakira, best, worldcup]"


## 6. Lemmatize

### 6a.

In [45]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [None]:
# Download the NLTK 'wordnet' resource (presumably the data the WordNet
# lemmatizer in this section reads; run this before lemmatizing).
nltk.download('wordnet')

### 6b.

In [41]:
def lemmatizer(word_list, wnl):
    """Lemmatize every token in ``word_list`` with the given lemmatizer.

    Args:
        word_list: Iterable of word tokens.
        wnl: Object exposing a ``lemmatize(word)`` method.

    Returns:
        A list with one lemmatized token per input token, in the same order.
    """
    lemmas = []
    for token in word_list:
        lemmas.append(wnl.lemmatize(token))
    return lemmas

In [42]:
# Lemmatize every stop-word-free token list with the shared lemmatizer instance.
lemmatized_text = (
    dataset['nostop_text']
    .map(lambda tokens: lemmatizer(tokens, wnl))
    .rename("lemmatized_text")
)

In [43]:
# Attach the lemmatized token lists as a new column. assign() overwrites
# "lemmatized_text" on a re-run instead of appending a duplicate column the way
# pd.concat does.
dataset = dataset.assign(lemmatized_text=lemmatized_text)

### 6c.

In [44]:
dataset.head()

Unnamed: 0,CONTENT,punct_text,token_text,nostop_text,stemmed_text,lemmatized_text
0,Nice song﻿,Nice song,"[nice, song]","[nice, song]","[nice, song]","[nice, song]"
1,I love song ﻿,I love song,"[i, love, song]","[love, song]","[love, song]","[love, song]"
2,I love song ﻿,I love song,"[i, love, song]","[love, song]","[love, song]","[love, song]"
3,"860,000,000 lets make it first female to reach...",860000000 lets make it first female to reach o...,"[860000000, lets, make, it, first, female, to,...","[860000000, lets, make, first, female, reach, ...","[860000000, let, make, first, femal, reach, on...","[860000000, let, make, first, female, reach, o..."
4,shakira is best for worldcup﻿,shakira is best for worldcup,"[shakira, is, best, for, worldcup]","[shakira, best, worldcup]","[shakira, best, worldcup]","[shakira, best, worldcup]"


---

## 7. Vectorizing

### 7a.

In [46]:
from sklearn.feature_extraction.text import CountVectorizer