# Making necessary imports

In [5]:
import string
import pandas as pd

In [6]:
# NLP library imports
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/elliot/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/elliot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/elliot/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Applying the transformation we've seen to our dataset

In [11]:
# Loading dataFrame
df = pd.read_csv(r'cryptodata_test_preprocessed.csv') # requires that the previous step of pre-processing has worked!

# Preparing transformations for preprocessing function
caracters_to_remove = list(string.punctuation)
transformation_car_dict = {initial:" " for initial in caracters_to_remove}

with_accent = ['é', 'è', 'ê', 'à', 'ù', 'ç', 'ô', 'î']
without_accent = ['e', 'e', 'e', 'a', 'u', 'c', 'o', 'i']
transformation_accent_dict = {before:after for before, after in zip(with_accent, without_accent)}

stopW = stopwords.words('french')
stopW += ['les', 'a', 'tout']


# Preprocessing function to apply to the content column
def preprocessing(review):
  
    # Tokenization
    tokens = word_tokenize(review)
    
    # Deleting words with  only one caracter
    tokens = [token for token in tokens if len(token)>2]
    
    # stopwords + lowercase
    tokens = [token.lower() for token in tokens if token.lower() not in stopW]   
    
    # Removing accents
    tokens = [token.translate(str.maketrans(transformation_accent_dict)) for token in tokens]
    
    # Deleting specific caracters
    tokens = [token.translate(str.maketrans(transformation_car_dict)) for token in tokens]
        
    return tokens
  

# Creating a new column swith tokenized reviews
df['tokens'] = df['body'].apply(str).apply(preprocessing)


# Displaying part of the result
df.tail()

Unnamed: 0,time,body,stopword_reviews,body_as_str,tokens
495,1543420943,The new broker-dealer and end-user oriented pr...,"['new', 'broker', 'dealer', 'end', 'user', 'or...",The new broker-dealer and end-user oriented pr...,"[the, new, broker dealer, and, end user, orien..."
496,1543420851,"Tapatalk, the mobile forum application with an...","['tapatalk', 'mobile', 'forum', 'application',...","Tapatalk, the mobile forum application with an...","[tapatalk, the, mobile, forum, application, wi..."
497,1543420803,SEC indecision has led companies including sev...,"['sec', 'indecision', 'led', 'companies', 'inc...",SEC indecision has led companies including sev...,"[sec, indecision, has, led, companies, includi..."
498,1543420803,The chairman of the U.S. Securities and Exchan...,"['chairman', 'u', 'securities', 'exchange', 'c...",The chairman of the U.S. Securities and Exchan...,"[the, chairman, the, u s , securities, and, ex..."
499,1543420552,"KNOMAD, the Global Knowledge Partnership onRea...","['knomad', 'global', 'knowledge', 'partnership...","KNOMAD, the Global Knowledge Partnership onRea...","[knomad, the, global, knowledge, partnership, ..."


# Discovering [Stemming](https://en.wikipedia.org/wiki/Stemming) and [Lemmatisation](https://en.wikipedia.org/wiki/Lemmatisation)


If you want to understand how the [Porter Algorithm](https://fr.wikipedia.org/wiki/Racinisation#Algorithme_de_Porter) works.

In [12]:
#create Stemmer objects
porter = PorterStemmer()
lancaster=LancasterStemmer()

## Visualizing the effects of two different stemmers on basic words

In [13]:
#A list of words to be stemmed
word_list = ["friend", "friendship", "friends", "friendships","stabil","destabilize","misunderstanding","railroad","moonlight","football"]
print("{0:20}{1:20}{2:20}".format("Word","Porter Stemmer","lancaster Stemmer"))
for word in word_list:
    print("{0:20}{1:20}{2:20}".format(word,porter.stem(word),lancaster.stem(word)))

Word                Porter Stemmer      lancaster Stemmer   
friend              friend              friend              
friendship          friendship          friend              
friends             friend              friend              
friendships         friendship          friend              
stabil              stabil              stabl               
destabilize         destabil            dest                
misunderstanding    misunderstand       misunderstand       
railroad            railroad            railroad            
moonlight           moonlight           moonlight           
football            footbal             footbal             


## Effects on a total sentence

In [14]:
def stemSentence(sentence, stemmer):
    
    token_words = word_tokenize(sentence)
    stem_sentence = []
    
    for word in token_words:
        stem_sentence.append(stemmer.stem(word))
        stem_sentence.append(" ")
    
    return "".join(stem_sentence)

In [15]:
# And compare differences
sentence="Pythoners are very intelligent and work very pythonly and now they are pythoning their way to success."

print(stemSentence(sentence, lancaster))
print(stemSentence(sentence, porter))

python ar very intellig and work very python and now they ar python their way to success . 
python are veri intellig and work veri pythonli and now they are python their way to success . 


In [16]:
# Look at what is happening on a french sentence
sentence="Ce matin je suis allé acheter une galette à la boulangerie puis je me suis régalé avant de venir en cours."

print(stemSentence(sentence, lancaster))
print(stemSentence(sentence, porter))

ce matin je sui allé achet un galet à la boulangery pui je me sui régalé av de venir en cour . 
Ce matin je sui allé achet une galett à la boulangeri pui je me sui régalé avant de venir en cour . 


## A stemmer to use on different languages (for example french..)

In [17]:
def frenchStemSentence(sentence):
    frenchStemmer=SnowballStemmer("french", ignore_stopwords=False)
    token_words=word_tokenize(sentence)
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(frenchStemmer.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

frenchStemSentence("cette phrase est à la fois amusante et surprenante")

'cet phras est à la fois amus et surpren '

## Having a look at lemmatization

In [18]:
# Initiate lemmatizer
lemmatizer = WordNetLemmatizer()

# Create lematizing function
def lemmatize(sentence):
    tokens=word_tokenize(sentence)
    tokens = [lemmatizer.lemmatize(lemmatizer.lemmatize(lemmatizer.lemmatize(token,pos='a'),pos='v'),pos='n') for token in tokens]
    return " ".join(tokens)

# And display results
lemmer = lemmatize("Such an analysis can reveal features that are not easily visible from the variations in the individual genes and can lead to a picture of expression that is more biologically transparent")

In [19]:
lemmer

'Such an analysis can reveal feature that be not easily visible from the variation in the individual gene and can lead to a picture of expression that be more biologically transparent'

# Applying one of those modification to our dataset

 **Preparing both functions**

In [20]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
def lemmatize(tokens):
    tokens = [lemmatizer.lemmatize(lemmatizer.lemmatize(lemmatizer.lemmatize(token,pos='a'),pos='v'),pos='n') for token in tokens]
    return tokens  

# Stemming
frenchStemmer=SnowballStemmer("french")
def stem(tokens):
    tokens = [frenchStemmer.stem(token) for token in tokens]
    return tokens  


**Selecting which one to apply, given the language used in your reviews**

In [21]:
# Are your reviews in English ? (here it is unfortunately not the case)
english = False

**And finally applying it to our dataFrame**

In [24]:
# Making appropriate modification
if english:
    df['body_lemmatized'] = df['tokens'].apply(lemmatize)

else:
    df['body_lemmatized'] = df['tokens'].apply(stem)

# And displaying results
df.head()

Unnamed: 0,time,body,stopword_reviews,body_as_str,tokens,review_lemmatized,body_lemmatized
0,1543622368,CoinSpeakerProBit: Professional Digital Curren...,"['coinspeakerprobit', 'professional', 'digital...",CoinSpeakerProBit: Professional Digital Curren...,"[coinspeakerprobit, professional, digital, cur...","[coinspeakerprob, professional, digital, curre...","[coinspeakerprob, professional, digital, curre..."
1,1543620634,Today in crypto heard rumors of Satoshi's retu...,"['today', 'crypto', 'heard', 'rumors', 'satosh...",Today in crypto heard rumors of Satoshi's retu...,"[today, crypto, heard, rumors, satoshi, return...","[today, crypto, heard, rumor, satosh, return, ...","[today, crypto, heard, rumor, satosh, return, ..."
2,1543620300,Crashes in the crypto market resulted in spike...,"['crashes', 'crypto', 'market', 'resulted', 's...",Crashes in the crypto market resulted in spike...,"[crashes, the, crypto, market, resulted, spike...","[crash, the, crypto, market, resulted, spik, m...","[crash, the, crypto, market, resulted, spik, m..."
3,1543620202,SEC Chairman Jay Clayton has claimed that bitc...,"['sec', 'chairman', 'jay', 'clayton', 'claimed...",SEC Chairman Jay Clayton has claimed that bitc...,"[sec, chairman, jay, clayton, has, claimed, th...","[sec, chairman, jay, clayton, has, claimed, th...","[sec, chairman, jay, clayton, has, claimed, th..."
4,1543618813,There are four different Ethereum working grou...,"['four', 'different', 'ethereum', 'working', '...",There are four different Ethereum working grou...,"[there, are, four, different, ethereum, workin...","[ther, are, four, different, ethereum, working...","[ther, are, four, different, ethereum, working..."


# Final modification

In [74]:
# Why not doing the same on title 
df['title_lemmatized'] = df['title'].apply(preprocessing).apply(stem)
df.reset_index(drop = True, inplace = True)



In [None]:
# Finally keeping only necessary columns
del(df['content'])
del(df['tokens'])



In [26]:
df.head(10)


Unnamed: 0,time,body,stopword_reviews,body_as_str,tokens,review_lemmatized,body_lemmatized
0,1543622368,CoinSpeakerProBit: Professional Digital Curren...,"['coinspeakerprobit', 'professional', 'digital...",CoinSpeakerProBit: Professional Digital Curren...,"[coinspeakerprobit, professional, digital, cur...","[coinspeakerprob, professional, digital, curre...","[coinspeakerprob, professional, digital, curre..."
1,1543620634,Today in crypto heard rumors of Satoshi's retu...,"['today', 'crypto', 'heard', 'rumors', 'satosh...",Today in crypto heard rumors of Satoshi's retu...,"[today, crypto, heard, rumors, satoshi, return...","[today, crypto, heard, rumor, satosh, return, ...","[today, crypto, heard, rumor, satosh, return, ..."
2,1543620300,Crashes in the crypto market resulted in spike...,"['crashes', 'crypto', 'market', 'resulted', 's...",Crashes in the crypto market resulted in spike...,"[crashes, the, crypto, market, resulted, spike...","[crash, the, crypto, market, resulted, spik, m...","[crash, the, crypto, market, resulted, spik, m..."
3,1543620202,SEC Chairman Jay Clayton has claimed that bitc...,"['sec', 'chairman', 'jay', 'clayton', 'claimed...",SEC Chairman Jay Clayton has claimed that bitc...,"[sec, chairman, jay, clayton, has, claimed, th...","[sec, chairman, jay, clayton, has, claimed, th...","[sec, chairman, jay, clayton, has, claimed, th..."
4,1543618813,There are four different Ethereum working grou...,"['four', 'different', 'ethereum', 'working', '...",There are four different Ethereum working grou...,"[there, are, four, different, ethereum, workin...","[ther, are, four, different, ethereum, working...","[ther, are, four, different, ethereum, working..."
5,1543617903,"On Nov. 29, Sirin Labs announced the commercia...","['nov', '29', 'sirin', 'labs', 'announced', 'c...","On Nov. 29, Sirin Labs announced the commercia...","[nov , sirin, labs, announced, the, commercial...","[nov , sirin, lab, announced, the, commercial,...","[nov , sirin, lab, announced, the, commercial,..."
6,1543616700,"Switzerlands oldest university, the University...","['switzerlands', 'oldest', 'university', 'univ...","Switzerlands oldest university, the University...","[switzerlands, oldest, university, the, univer...","[switzerland, oldest, university, the, univers...","[switzerland, oldest, university, the, univers..."
7,1543616305,The newer HBUS cryptocurrency exchange has tak...,"['newer', 'hbus', 'cryptocurrency', 'exchange'...",The newer HBUS cryptocurrency exchange has tak...,"[the, newer, hbus, cryptocurrency, exchange, h...","[the, new, hbus, cryptocurrency, exchang, has,...","[the, new, hbus, cryptocurrency, exchang, has,..."
8,1543616240,Having met the U.S. Se...,"['met', 'u', 'securities', 'exchange', 'commis...",Having met the U.S. Se...,"[having, met, the, u s , securities, and, exch...","[having, met, the, u s , securit, and, exchang...","[having, met, the, u s , securit, and, exchang..."
9,1543615852,"The latest Ethereum 1x meeting, which was cond...","['latest', 'ethereum', '1x', 'meeting', 'condu...","The latest Ethereum 1x meeting, which was cond...","[the, latest, ethereum, meeting, which, was, c...","[the, latest, ethereum, meeting, which, was, c...","[the, latest, ethereum, meeting, which, was, c..."


In [25]:
df.to_csv (r'cryptodata_test_lemmatized', index=False, encoding = 'utf8')

In [75]:
df['review'][0]

"Après une désastreuse aventure au Bois franc, quel plaisir de retourner au lac d'Ailette..en plus les cottages ont été rénovés avec goût, Partis début août sous la canicule, nous avons apprécié les plaisirs de l'aquamondo malgré l'affluence,(peut être faudrait-il que les personnes ne séjournant pas dans le parc n'aient plus accès aux installations quand celui-ci est déjà complet - 1700 personnes dans la piscine c'est vraiment beaucoup trop)Nous avons loué un cottage premium bord de lac . Nous avons aimés nos soirées tranquilles sur la terrasse; beaucoup moins la chaleur dans le cottage (ventilation à prévoir)  celui-ci était très propre à notre arrivée et les quelques soucis rencontrés (pas de pile dans la télécommande, sèche-linge cassé) ont été très vite réglés par un personnel compétant  De quoi bien commencer notre séjour. Un bel endroit où nous reviendrons certainement l'été prochain"