1. Preprocess the dataset to convert it into a format that the algorithm can work with. 
- Perform pre-processing steps like Removing Punctuations, Numbers, and Special Characters, Stop Words in dataset.               (1Mark)
- Perform normalization by using Stemming or Lemmatization.  (1Mark)

In [3]:
import pandas as pa
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
english_stop_words=set(stopwords.words("english"))
wordnet_lemmatizer = WordNetLemmatizer()
porter_stemmer = PorterStemmer()

In [4]:
ds = pa.read_csv("boston airbnb reviews.csv")
print(f"Loaded dataset contains {ds.shape[0]} records.")
print(f"Check and remove null and duplicate records...")
null_records = ds[ds['comments'].isnull()]
ds = ds.dropna()
ds = ds.drop_duplicates()
print(f"Total records after removing null and duplicate records : {ds.shape[0]}")
def pre_processing(text):
    words = word_tokenize(text)
    only_words = [w for w in words if not w in string.punctuation]
    meaningful_words = [w for w in only_words if not w in english_stop_words]
    lemmatized_list = [wordnet_lemmatizer.lemmatize(w)  for w in meaningful_words]
    stemmed_list = [ porter_stemmer.stem(w) for w in lemmatized_list ]
    return stemmed_list 

ds['processed_comments'] = ds['comments'].apply(pre_processing)
ds.head()

Loaded dataset contains 68275 records.
Check and remove null and duplicate records...
Total records after removing null and duplicate records : 68222


Unnamed: 0,listing_id,id,reviewer_id,reviewer_name,comments,processed_comments
0,1178162,4724140,4298113,Olivier,My stay at islam's place was really cool! Good...,"[my, stay, islam, 's, place, realli, cool, goo..."
1,1178162,4869189,6452964,Charlotte,Great location for both airport and city - gre...,"[great, locat, airport, citi, great, amen, hou..."
2,1178162,5003196,6449554,Sebastian,We really enjoyed our stay at Islams house. Fr...,"[we, realli, enjoy, stay, islam, hous, from, o..."
3,1178162,5150351,2215611,Marine,The room was nice and clean and so were the co...,"[the, room, nice, clean, commod, veri, close, ..."
4,1178162,5171140,6848427,Andrew,Great location. Just 5 mins walk from the Airp...,"[great, locat, just, 5, min, walk, airport, st..."


In [5]:
ds['comments'][0]

"My stay at islam's place was really cool! Good location, 5min away from subway, then 10min from downtown. The room was nice, all place was clean. Islam managed pretty well our arrival, even if it was last minute ;) i do recommand this place to any airbnb user :)"

In [6]:
ds['processed_comments'][0]

['my',
 'stay',
 'islam',
 "'s",
 'place',
 'realli',
 'cool',
 'good',
 'locat',
 '5min',
 'away',
 'subway',
 '10min',
 'downtown',
 'the',
 'room',
 'nice',
 'place',
 'clean',
 'islam',
 'manag',
 'pretti',
 'well',
 'arriv',
 'even',
 'last',
 'minut',
 'recommand',
 'place',
 'airbnb',
 'user']

2. Apply a POS tagging algorithm or utilize a pretrained POS tagger to assign POS tags to the words in the dataset. (3Marks)

In [7]:
ds['tagged'] = ds['processed_comments'].apply(nltk.pos_tag)

In [8]:
ds.head()


Unnamed: 0,listing_id,id,reviewer_id,reviewer_name,comments,processed_comments,tagged
0,1178162,4724140,4298113,Olivier,My stay at islam's place was really cool! Good...,"[my, stay, islam, 's, place, realli, cool, goo...","[(my, PRP$), (stay, NN), (islam, NN), ('s, POS..."
1,1178162,4869189,6452964,Charlotte,Great location for both airport and city - gre...,"[great, locat, airport, citi, great, amen, hou...","[(great, JJ), (locat, JJ), (airport, NN), (cit..."
2,1178162,5003196,6449554,Sebastian,We really enjoyed our stay at Islams house. Fr...,"[we, realli, enjoy, stay, islam, hous, from, o...","[(we, PRP), (realli, VBP), (enjoy, JJ), (stay,..."
3,1178162,5150351,2215611,Marine,The room was nice and clean and so were the co...,"[the, room, nice, clean, commod, veri, close, ...","[(the, DT), (room, NN), (nice, JJ), (clean, JJ..."
4,1178162,5171140,6848427,Andrew,Great location. Just 5 mins walk from the Airp...,"[great, locat, just, 5, min, walk, airport, st...","[(great, JJ), (locat, NN), (just, RB), (5, CD)..."


In [10]:
ds['tagged'][0]

[('my', 'PRP$'),
 ('stay', 'NN'),
 ('islam', 'NN'),
 ("'s", 'POS'),
 ('place', 'NN'),
 ('realli', 'NN'),
 ('cool', 'NN'),
 ('good', 'JJ'),
 ('locat', 'NN'),
 ('5min', 'CD'),
 ('away', 'RB'),
 ('subway', 'RB'),
 ('10min', 'CD'),
 ('downtown', 'IN'),
 ('the', 'DT'),
 ('room', 'NN'),
 ('nice', 'JJ'),
 ('place', 'NN'),
 ('clean', 'JJ'),
 ('islam', 'NN'),
 ('manag', 'NN'),
 ('pretti', 'RB'),
 ('well', 'RB'),
 ('arriv', 'RB'),
 ('even', 'RB'),
 ('last', 'JJ'),
 ('minut', 'NN'),
 ('recommand', 'NN'),
 ('place', 'NN'),
 ('airbnb', 'IN'),
 ('user', 'NN')]

In [37]:
def calculate_frequency(list_of_pos_tags):
    tags = []
    for word, tag in list_of_pos_tags:
        tags.append(tag)
    return nltk.FreqDist(tags).items()

ds['pos_tag_freq'] = ds['tagged'].apply(calculate_frequency)

In [None]:
ds

In [38]:
data_df = pa.DataFrame(ds['pos_tag_freq'].tolist())
data_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,"(PRP$, 1)","(NN, 14)","(POS, 1)","(JJ, 4)","(CD, 2)","(RB, 6)","(IN, 2)","(DT, 1)",,,...,,,,,,,,,,
1,"(JJ, 4)","(NN, 2)","(NNS, 2)","(VBP, 1)","(RB, 3)","(VB, 1)","(IN, 1)",,,,...,,,,,,,,,,
2,"(PRP, 2)","(VBP, 2)","(JJ, 10)","(NN, 19)","(RB, 4)","(IN, 3)","(VB, 3)","(DT, 2)","(CD, 1)","(CC, 1)",...,,,,,,,,,,
3,"(DT, 1)","(NN, 13)","(JJ, 4)","(VB, 2)","(MD, 1)",,,,,,...,,,,,,,,,,
4,"(JJ, 4)","(NN, 9)","(RB, 1)","(CD, 1)",,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68217,"(NN, 36)","(JJ, 4)","(IN, 1)","(VBD, 1)","(VBZ, 1)",,,,,,...,,,,,,,,,,
68218,"(JJ, 13)","(NN, 42)","(RB, 4)","(VBD, 2)","(VBP, 8)","(:, 2)","(MD, 1)","(VBN, 1)","(POS, 2)","(DT, 1)",...,,,,,,,,,,
68219,"(DT, 3)","(NN, 27)","(JJ, 7)","(VBD, 2)","(RB, 4)","(VBZ, 2)","(NNS, 1)","(VBP, 2)","(CD, 1)","(VB, 3)",...,,,,,,,,,,
68220,"(VB, 4)","(JJR, 1)","(NN, 81)","(POS, 3)","(VBP, 11)","(VBD, 5)","(CD, 7)","(JJ, 25)","(VBZ, 3)","(DT, 4)",...,"(MD, 1)","(PRP, 1)",,,,,,,,


3.
- (a) Calculate the frequency of each POS tag in the dataset.
- (b) Identify and print the most common POS tags and their frequencies.  (2Marks)

4. Generate visualizations such as bar charts, word clouds to represent the POS tag frequencies                                 (1 Mark)

5. Display the HMM POS tagging on the first 4 rows of the dataset            (2 Marks)