In [1]:
import pandas as pd
import numpy as np 
import re
# nltk.download('stopwords') 
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 
# nltk.download('wordnet')
from nltk.tokenize import word_tokenize 

# Shared stemmer and lemmatizer instances; they are created once here and
# passed into stemmer() / lemmer() when those are applied to the reviews.
ps = PorterStemmer()
lem = WordNetLemmatizer()

In [2]:
# Load the ski-area reviews CSV (path is relative to the notebook's working
# directory) and preview the first rows.
skiAreaReviewDF = pd.read_csv('OnTheSnow_SkiAreaReviews.csv')
skiAreaReviewDF.head()

Unnamed: 0,State,Ski Area,Reviewer Name,Review Date,Review Star Rating (out of 5),Review Text
0,california,squaw-valley-usa,philip sayles,31st December 2019,3,I'm glad our family experienced Squaw but I wo...
1,california,squaw-valley-usa,mateonelson,22nd May 2019,4,"I went skiing today 5/22, granite chief was am..."
2,california,squaw-valley-usa,,7th March 2019,2,We had a horrible experience on our family ski...
3,california,squaw-valley-usa,Ivan Cazares,5th March 2019,3,This is the first year I ski Squaw. I've been ...
4,california,squaw-valley-usa,welzbob,26th February 2019,3,"Both Squaw and Alpine have incredible terrain,..."


In [3]:
skiAreaReviewDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18262 entries, 0 to 18261
Data columns (total 6 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   State                          18262 non-null  object
 1   Ski Area                       18262 non-null  object
 2   Reviewer Name                  18153 non-null  object
 3   Review Date                    18262 non-null  object
 4   Review Star Rating (out of 5)  18262 non-null  int64 
 5   Review Text                    18250 non-null  object
dtypes: int64(1), object(5)
memory usage: 856.2+ KB


## Completeness of our data

Calling `.info()` shows that every one of the 18,262 reviews has a star rating, 
but only 18,250 have review text (12 are missing) and only 18,153 have a reviewer name. 

## Data Cleanup
Next, let's perform a few steps to get the text of each review cleaned up and ready for Natural Language Processing. 
We keep only the review text and star rating, drop the reviews that have no text, and expand contractions. 
We then define a function that uses regular expressions to remove anything that is not a letter and converts the text to lower case. 

In [4]:
# Keep only the two columns used downstream, then discard reviews with no text.
skiAreaReviewDF = (
    skiAreaReviewDF[['Review Text', 'Review Star Rating (out of 5)']]
    .dropna(subset=['Review Text'])
)

In [5]:
skiAreaReviewDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18250 entries, 0 to 18261
Data columns (total 2 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Review Text                    18250 non-null  object
 1   Review Star Rating (out of 5)  18250 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 427.7+ KB


In [6]:
skiAreaReviewDF['Review Text'][7332]

"Best skiing in the Midwest! We've skied all over Northern Michigan and Minnesota and found this to be the best skiing.  Great long runs!  Excellent service.  My daughter snapped the binding on the skis she owns.  The guy in the ski rental shop repaired it in 10 mins at NO charge and she was back on the slopes.   Fantastic new gondola -great scenery and views of Lake Superior - can't get that out west!"

In [7]:
# Contraction -> expansion lookup, built from a CSV that has a 'Key' column
# (used as the dict keys) and a 'Value' column (used as the dict values).
# Reference: https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractionsDict = pd.read_csv('ContractionsDict.csv').set_index('Key')['Value'].to_dict()

In [8]:
def contractionsHandler(word, contractionsDict):
    """Expand English contractions found in a piece of text.

    The text is split on whitespace and every token is looked up,
    case-insensitively, in ``contractionsDict``. A token that does not match
    as-is is retried with surrounding punctuation trimmed and with a curly
    apostrophe normalised to a straight one, so that ``"can't,"``,
    ``"(We've)"`` and ``"don\u2019t"`` are expanded too. Trimmed punctuation
    is re-attached around the expansion.

    Parameters
    ----------
    word : str
        The text to process (may contain many words).
    contractionsDict : dict
        Maps lower-case contractions (e.g. ``"can't"``) to their expansion.

    Returns
    -------
    str
        The tokens joined by single spaces. Expansions are inserted exactly
        as stored in ``contractionsDict``; unmatched tokens are unchanged.
    """
    curlyApostrophe = '\u2019'

    def isCoreChar(char):
        # Apostrophes are part of the contraction itself, so never trim them.
        return char.isalnum() or char == "'" or char == curlyApostrophe

    def lookup(candidate):
        # Try the literal lower-cased token first (original behaviour), then
        # the same token with curly apostrophes replaced by straight ones.
        key = candidate.lower()
        if key in contractionsDict:
            return contractionsDict[key]
        return contractionsDict.get(key.replace(curlyApostrophe, "'"))

    expandedTokens = []
    for token in word.split():
        expansion = lookup(token)
        if expansion is not None:
            expandedTokens.append(expansion)
            continue

        # Locate the token's core by skipping punctuation at both edges.
        start, end = 0, len(token)
        while start < end and not isCoreChar(token[start]):
            start += 1
        while end > start and not isCoreChar(token[end - 1]):
            end -= 1

        core = token[start:end]
        expansion = lookup(core) if core and core != token else None
        if expansion is None:
            expandedTokens.append(token)
        else:
            expandedTokens.append(token[:start] + expansion + token[end:])
    return ' '.join(expandedTokens)

In [9]:
#contractionsDict

#contractionsDict["i've"]

In [10]:
# Expand contractions in every review; str() guards against non-string cells.
skiAreaReviewDF['Clean Reviews'] = skiAreaReviewDF['Review Text'].map(
    lambda review: contractionsHandler(str(review), contractionsDict)
)

In [11]:
skiAreaReviewDF['Clean Reviews'][7332]

'Best skiing in the Midwest! we have skied all over Northern Michigan and Minnesota and found this to be the best skiing. Great long runs! Excellent service. My daughter snapped the binding on the skis she owns. The guy in the ski rental shop repaired it in 10 mins at NO charge and she was back on the slopes. Fantastic new gondola -great scenery and views of Lake Superior - cannot get that out west!'

In [12]:
def replacer(x):
    """Normalise review text to lower-case letters separated by single spaces.

    Apostrophes are deleted (so ``"don't"`` becomes ``"dont"``), and every
    run of characters other than ASCII letters -- digits, punctuation and any
    whitespace including newlines -- is replaced by one space.

    Parameters
    ----------
    x : str
        Text to clean.

    Returns
    -------
    str
        The cleaned, lower-cased text. Leading/trailing non-letter runs are
        kept as a single space rather than stripped, so text with no letters
        at all comes back as ``' '`` (or ``''`` for empty input); later cells
        rely on that ``' '`` sentinel.
    """
    # Remove apostrophes before collapsing, otherwise a stand-alone apostrophe
    # ("rock ' n ' roll") would leave a double space behind.
    x = x.replace("'", '')
    # One pass replaces and collapses: each run of non-letters -> one space.
    x = re.sub(r"[^a-zA-Z]+", ' ', x)
    return x.lower()

In [13]:
# Strip non-letters and lower-case the contraction-expanded text.
skiAreaReviewDF['Clean Reviews'] = skiAreaReviewDF['Clean Reviews'].map(
    lambda review: replacer(str(review))
)

In [14]:
skiAreaReviewDF['Clean Reviews'][7332]

'best skiing in the midwest we have skied all over northern michigan and minnesota and found this to be the best skiing great long runs excellent service my daughter snapped the binding on the skis she owns the guy in the ski rental shop repaired it in mins at no charge and she was back on the slopes fantastic new gondola great scenery and views of lake superior cannot get that out west '

In [15]:
skiAreaReviewDF[skiAreaReviewDF['Clean Reviews'] == ' '].head()


Unnamed: 0,Review Text,Review Star Rating (out of 5),Clean Reviews
8,",",5,
10,",",5,
1790,.\r,1,
1827,.\r,1,
3560,.,4,


In [16]:
# Discard reviews whose cleaned text is just a single space, i.e. reviews
# that contained no letters (such as "," or ".").
skiAreaReviewDF = skiAreaReviewDF.loc[lambda frame: frame['Clean Reviews'] != ' ']

In [17]:
skiAreaReviewDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18222 entries, 0 to 18261
Data columns (total 3 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Review Text                    18222 non-null  object
 1   Review Star Rating (out of 5)  18222 non-null  int64 
 2   Clean Reviews                  18222 non-null  object
dtypes: int64(1), object(2)
memory usage: 569.4+ KB


In [18]:
def wordCounter(x):
    """Return how many whitespace-separated tokens the string ``x`` contains."""
    tokens = x.split()
    return len(tokens)

In [19]:
# Number of words in each cleaned review.
skiAreaReviewDF['Word Count'] = skiAreaReviewDF['Clean Reviews'].apply(wordCounter)

In [20]:
# Keep only reviews with at least two words after cleaning.
skiAreaReviewDF = skiAreaReviewDF.loc[lambda frame: frame['Word Count'] > 1]

In [21]:
skiAreaReviewDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18041 entries, 0 to 18261
Data columns (total 4 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Review Text                    18041 non-null  object
 1   Review Star Rating (out of 5)  18041 non-null  int64 
 2   Clean Reviews                  18041 non-null  object
 3   Word Count                     18041 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 704.7+ KB


In [22]:
def stemmer(x, ps, stop_words=None):
    """Tokenize ``x``, drop English stop words and stem the remaining tokens.

    Parameters
    ----------
    x : str
        Text to process.
    ps : object
        Stemmer exposing a ``stem(word)`` method (a ``PorterStemmer`` here).
    stop_words : collection of str, optional
        Tokens to discard. Defaults to NLTK's English stop-word list. Pass a
        pre-built set to avoid reloading the list on every call.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    if stop_words is None:
        # Build the set once per call. Previously it was rebuilt for every
        # token inside the comprehension.
        stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(x)
    # word_tokenize yields strings only, so no NaN check is needed here.
    stems = [ps.stem(token) for token in tokens if token not in stop_words]
    return ' '.join(stems)

In [23]:
# Stemmed version of each cleaned review (stop words removed).
skiAreaReviewDF['Clean Reviews Stemmed'] = skiAreaReviewDF['Clean Reviews'].map(
    lambda review: stemmer(review, ps)
)

In [24]:
skiAreaReviewDF['Clean Reviews Stemmed'][7332]

'best ski midwest ski northern michigan minnesota found best ski great long run excel servic daughter snap bind ski own guy ski rental shop repair min charg back slope fantast new gondola great sceneri view lake superior get west'

In [25]:
def lemmer(x, lem, stop_words=None):
    """Tokenize ``x``, drop English stop words and lemmatize the remaining tokens.

    Parameters
    ----------
    x : str
        Text to process.
    lem : object
        Lemmatizer exposing a ``lemmatize(word)`` method (a
        ``WordNetLemmatizer`` here).
    stop_words : collection of str, optional
        Tokens to discard. Defaults to NLTK's English stop-word list. Pass a
        pre-built set to avoid reloading the list on every call.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    if stop_words is None:
        # Build the set once per call. Previously it was rebuilt for every
        # token inside the comprehension.
        stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(x)
    # word_tokenize yields strings only, so no NaN check is needed here.
    lemmas = [lem.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(lemmas)

In [26]:
# Lemmatized version of each cleaned review (stop words removed).
skiAreaReviewDF['Clean Reviews Lemmed'] = skiAreaReviewDF['Clean Reviews'].map(
    lambda review: lemmer(review, lem)
)

In [27]:
skiAreaReviewDF['Clean Reviews Lemmed'][7332]

'best skiing midwest skied northern michigan minnesota found best skiing great long run excellent service daughter snapped binding ski owns guy ski rental shop repaired min charge back slope fantastic new gondola great scenery view lake superior get west'

In [28]:
# Stem the already-lemmatized text to get the combined representation.
skiAreaReviewDF['Clean Reviews Lemmed and Stemmed'] = skiAreaReviewDF['Clean Reviews Lemmed'].map(
    lambda review: stemmer(review, ps)
)

In [29]:
skiAreaReviewDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18041 entries, 0 to 18261
Data columns (total 7 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   Review Text                       18041 non-null  object
 1   Review Star Rating (out of 5)     18041 non-null  int64 
 2   Clean Reviews                     18041 non-null  object
 3   Word Count                        18041 non-null  int64 
 4   Clean Reviews Stemmed             18041 non-null  object
 5   Clean Reviews Lemmed              18041 non-null  object
 6   Clean Reviews Lemmed and Stemmed  18041 non-null  object
dtypes: int64(2), object(5)
memory usage: 1.7+ MB


In [32]:
skiAreaReviewDF['Clean Reviews Lemmed and Stemmed'][7332]

'best ski midwest ski northern michigan minnesota found best ski great long run excel servic daughter snap bind ski own guy ski rental shop repair min charg back slope fantast new gondola great sceneri view lake superior get west'

In [34]:
# Persist the processed frame (raw text, rating, word count and the cleaned /
# stemmed / lemmatized columns) to CSV; index=False leaves the row index out.
skiAreaReviewDF.to_csv('ProcessedReviews.csv', index=False)