# Making necessary imports

In [1]:
import string
import pandas as pd

In [2]:
# NLP library imports
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/elliot/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/elliot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/elliot/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Applying the transformation we've seen to our dataset

In [4]:
# Loading dataFrame
df = pd.read_csv(r'cryptodata_test_preprocessed.csv') # requires that the previous step of pre-processing has worked!

# Preparing transformations for preprocessing function
caracters_to_remove = list(string.punctuation)
transformation_car_dict = {initial:" " for initial in caracters_to_remove}

with_accent = ['é', 'è', 'ê', 'à', 'ù', 'ç', 'ô', 'î']
without_accent = ['e', 'e', 'e', 'a', 'u', 'c', 'o', 'i']
transformation_accent_dict = {before:after for before, after in zip(with_accent, without_accent)}

stopW = stopwords.words('french')
stopW += ['les', 'a', 'tout']


# Preprocessing function to apply to the content column
def preprocessing(review):
  
    # Tokenization
    tokens = word_tokenize(review)
    
    # Deleting words with  only one caracter
    tokens = [token for token in tokens if len(token)>2]
    
    # stopwords + lowercase
    tokens = [token.lower() for token in tokens if token.lower() not in stopW]   
    
    # Removing accents
    tokens = [token.translate(str.maketrans(transformation_accent_dict)) for token in tokens]
    
    # Deleting specific caracters
    tokens = [token.translate(str.maketrans(transformation_car_dict)) for token in tokens]
        
    return tokens
  

# Creating a new column swith tokenized reviews
df['tokens'] = df['body'].apply(preprocessing)

# Displaying part of the result
df.tail()

TypeError: expected string or bytes-like object

# Discovering [Stemming](https://en.wikipedia.org/wiki/Stemming) and [Lemmatisation](https://en.wikipedia.org/wiki/Lemmatisation)


If you want to understand how the [Porter Algorithm](https://fr.wikipedia.org/wiki/Racinisation#Algorithme_de_Porter) works.

In [67]:
#create Stemmer objects
porter = PorterStemmer()
lancaster=LancasterStemmer()

## Visualizing the effects of two different stemmers on basic words

In [38]:
#A list of words to be stemmed
word_list = ["friend", "friendship", "friends", "friendships","stabil","destabilize","misunderstanding","railroad","moonlight","football"]
print("{0:20}{1:20}{2:20}".format("Word","Porter Stemmer","lancaster Stemmer"))
for word in word_list:
    print("{0:20}{1:20}{2:20}".format(word,porter.stem(word),lancaster.stem(word)))

Word                Porter Stemmer      lancaster Stemmer   
friend              friend              friend              
friendship          friendship          friend              
friends             friend              friend              
friendships         friendship          friend              
stabil              stabil              stabl               
destabilize         destabil            dest                
misunderstanding    misunderstand       misunderstand       
railroad            railroad            railroad            
moonlight           moonlight           moonlight           
football            footbal             footbal             


## Effects on a total sentence

In [68]:
def stemSentence(sentence, stemmer):
    
    token_words = word_tokenize(sentence)
    stem_sentence = []
    
    for word in token_words:
        stem_sentence.append(stemmer.stem(word))
        stem_sentence.append(" ")
    
    return "".join(stem_sentence)

In [40]:
# And compare differences
sentence="Pythoners are very intelligent and work very pythonly and now they are pythoning their way to success."

print(stemSentence(sentence, lancaster))
print(stemSentence(sentence, porter))

python ar very intellig and work very python and now they ar python their way to success . 
python are veri intellig and work veri pythonli and now they are python their way to success . 


In [41]:
# Look at what is happening on a french sentence
sentence="Ce matin je suis allé acheter une galette à la boulangerie puis je me suis régalé avant de venir en cours."

print(stemSentence(sentence, lancaster))
print(stemSentence(sentence, porter))

ce matin je sui allé achet un galet à la boulangery pui je me sui régalé av de venir en cour . 
Ce matin je sui allé achet une galett à la boulangeri pui je me sui régalé avant de venir en cour . 


## A stemmer to use on different languages (for example french..)

In [69]:
def frenchStemSentence(sentence):
    frenchStemmer=SnowballStemmer("french", ignore_stopwords=False)
    token_words=word_tokenize(sentence)
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(frenchStemmer.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

frenchStemSentence("cette phrase est à la fois amusante et surprenante")

'cet phras est à la fois amus et surpren '

## Having a look at lemmatization

In [70]:
# Initiate lemmatizer
lemmatizer = WordNetLemmatizer()

# Create lematizing function
def lemmatize(sentence):
    tokens=word_tokenize(sentence)
    tokens = [lemmatizer.lemmatize(lemmatizer.lemmatize(lemmatizer.lemmatize(token,pos='a'),pos='v'),pos='n') for token in tokens]
    return " ".join(tokens)

# And display results
lemmer = lemmatize("Such an analysis can reveal features that are not easily visible from the variations in the individual genes and can lead to a picture of expression that is more biologically transparent")

In [60]:
lemmer

'Such an analysis can reveal feature that be not easily visible from the variation in the individual gene and can lead to a picture of expression that be more biologically transparent'

# Applying one of those modification to our dataset

 **Preparing both functions**

In [71]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
def lemmatize(tokens):
    tokens = [lemmatizer.lemmatize(lemmatizer.lemmatize(lemmatizer.lemmatize(token,pos='a'),pos='v'),pos='n') for token in tokens]
    return tokens  

# Stemming
frenchStemmer=SnowballStemmer("french")
def stem(tokens):
    tokens = [frenchStemmer.stem(token) for token in tokens]
    return tokens  


**Selecting which one to apply, given the language used in your reviews**

In [72]:
# Are your reviews in English ? (here it is unfortunately not the case)
english = False

**And finally applying it to our dataFrame**

In [73]:
# Making appropriate modification
if english:
    df['review_lemmatized'] = df['tokens'].apply(lemmatize)

else:
    df['review_lemmatized'] = df['tokens'].apply(stem)

# And displaying results
df.head()

Unnamed: 0,hotel_name,published_date,rating,review,language,title,trip_date,tokens,review_lemmatized
0,Center Parcs Le Lac d'Ailette,27 septembre 2018,4,"Après une désastreuse aventure au Bois franc, ...",fr,très bon week end,août 2018,"[apres, desastreuse, aventure, bois, franc, qu...","[apre, desastr, aventur, bois, franc, quel, pl..."
1,Center Parcs Le Lac d'Ailette,18 janvier 2019,5,"Ambiance détendue , une vraie déconnexion. Le...",fr,"Séjour agréable comme toujours ,une vraie déco...",mars 2018,"[ambiance, detendue, vraie, deconnexion, logem...","[ambianc, detendu, vrai, deconnexion, log, pro..."
2,Center Parcs Le Lac d'Ailette,11 novembre 2018,3,Première fois que nous allions à center Parcs ...,fr,"3,5 serait plus juste",juillet 2018,"[premiere, fois, allions, center, parcs, ailet...","[premier, fois, allion, cent, parc, ailet, pre..."
3,Center Parcs Le Lac d'Ailette,3 octobre 2018,4,Génial pour les enfants petits et grands ! Ple...,fr,Endroit sympathique,avril 2018,"[genial, enfants, petits, grands, pleins, d ac...","[genial, enfant, petit, grand, plein, d activi..."
4,Center Parcs Le Lac d'Ailette,17 janvier 2019,2,Nous avons fait une réservation avec notre CE ...,fr,Réservation février 2019,janvier 2019,"[fait, reservation, novembre, 2018, cottage, p...","[fait, reserv, novembr, 2018, cottag, person, ..."


# Final modification

In [74]:
# Why not doing the same on title 
df['title_lemmatized'] = df['title'].apply(preprocessing).apply(stem)
df.reset_index(drop = True, inplace = True)



In [None]:
# Finally keeping only necessary columns
del(df['content'])
del(df['tokens'])



In [49]:
df.head(10)


Unnamed: 0,hotel_name,published_date,rating,review,language,title,trip_date,tokens,review_lemmatized,title_lemmatized
0,Center Parcs Le Lac d'Ailette,27 septembre 2018,4,"Après une désastreuse aventure au Bois franc, ...",fr,très bon week end,août 2018,"[apres, desastreuse, aventure, bois, franc, qu...","[apre, desastr, aventur, bois, franc, quel, pl...","[tre, bon, week, end]"
1,Center Parcs Le Lac d'Ailette,18 janvier 2019,5,"Ambiance détendue , une vraie déconnexion. Le...",fr,"Séjour agréable comme toujours ,une vraie déco...",mars 2018,"[ambiance, detendue, vraie, deconnexion, logem...","[ambianc, detendu, vrai, deconnexion, log, pro...","[sejour, agreabl, comm, toujour, vrai, deconne..."
2,Center Parcs Le Lac d'Ailette,11 novembre 2018,3,Première fois que nous allions à center Parcs ...,fr,"3,5 serait plus juste",juillet 2018,"[premiere, fois, allions, center, parcs, ailet...","[premier, fois, allion, cent, parc, ailet, pre...","[3 5, plus, just]"
3,Center Parcs Le Lac d'Ailette,3 octobre 2018,4,Génial pour les enfants petits et grands ! Ple...,fr,Endroit sympathique,avril 2018,"[genial, enfants, petits, grands, pleins, d ac...","[genial, enfant, petit, grand, plein, d activi...","[endroit, sympath]"
4,Center Parcs Le Lac d'Ailette,17 janvier 2019,2,Nous avons fait une réservation avec notre CE ...,fr,Réservation février 2019,janvier 2019,"[fait, reservation, novembre, 2018, cottage, p...","[fait, reserv, novembr, 2018, cottag, person, ...","[reserv, fevri, 2019]"
5,Center Parcs Le Lac d'Ailette,"August 31, 2018",4,"This was the third European CP we've visited, ...",en,"Great holiday, but too many cars on site",August 2018,"[this, was, the, third, european, ve, visited...","[this, was, the, third, european, ve, visited...","[great, holiday, but, too, many, car, sit]"
6,Center Parcs Le Lac d'Ailette,"September 1, 2018",5,I have to admit after reading so many negative...,en,Supassed expectations,August 2018,"[have, admit, after, reading, many, negative, ...","[hav, admit, after, reading, many, negat, revi...","[supassed, expect]"
7,Center Parcs Le Lac d'Ailette,"August 31, 2018",3,they have good amenities but the cottages are ...,en,good amenities but the cottages are not clean,August 2018,"[they, have, good, amenities, but, the, cottag...","[they, hav, good, amenit, but, the, cottag, ar...","[good, amenit, but, the, cottag, are, not, clean]"
8,Center Parcs Le Lac d'Ailette,"September 1, 2018",3,Pool was fun but there was overcrowding at tim...,en,Not enamoured with Le Lac D'Ailette,August 2018,"[pool, was, fun, but, there, was, overcrowding...","[pool, was, fun, but, ther, was, overcrowding,...","[not, enamoured, with, lac, d ailet]"
9,Center Parcs Le Lac d'Ailette,"September 1, 2018",2,Our third European Centre Parcs. The location...,en,Family holiday - children 10 and 7,August 2018,"[our, third, european, centre, parcs, the, loc...","[our, third, european, centr, parc, the, locat...","[family, holiday, children, and]"


In [78]:
df.to_csv (r'C:\Users\Ellio\Desktop\clean_data_group_5.csv', index=False, encoding = 'utf8')

In [75]:
df['review'][0]

"Après une désastreuse aventure au Bois franc, quel plaisir de retourner au lac d'Ailette..en plus les cottages ont été rénovés avec goût, Partis début août sous la canicule, nous avons apprécié les plaisirs de l'aquamondo malgré l'affluence,(peut être faudrait-il que les personnes ne séjournant pas dans le parc n'aient plus accès aux installations quand celui-ci est déjà complet - 1700 personnes dans la piscine c'est vraiment beaucoup trop)Nous avons loué un cottage premium bord de lac . Nous avons aimés nos soirées tranquilles sur la terrasse; beaucoup moins la chaleur dans le cottage (ventilation à prévoir)  celui-ci était très propre à notre arrivée et les quelques soucis rencontrés (pas de pile dans la télécommande, sèche-linge cassé) ont été très vite réglés par un personnel compétant  De quoi bien commencer notre séjour. Un bel endroit où nous reviendrons certainement l'été prochain"