In [None]:
# Install Dask because we are going to process a ton of data that Pandas would take a very long time to complete.

In [2]:
!pip install dask dask[dataframe]

Collecting dask-expr<1.1,>=1.0
  Downloading dask_expr-1.0.11-py3-none-any.whl (184 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m185.0/185.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m
[?25hCollecting pyarrow>=7.0.0
  Using cached pyarrow-15.0.2-cp310-cp310-macosx_10_15_x86_64.whl (27.2 MB)
Installing collected packages: pyarrow, dask-expr
Successfully installed dask-expr-1.0.11 pyarrow-15.0.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
#### RESTART THE KERNEL

Go to https://www.kaggle.com/datasets/ilhamfp31/yelp-review-dataset, download it and rename it to "evaluation_dataset.csv"

NOTE: Negative polarity is class 1, and positive class 2.

In [16]:
import pandas as pd

df = pd.read_csv("evaluation_dataset.csv",  names=['sentiment','text'], header=None)

In [17]:
df

Unnamed: 0,sentiment,text
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...
...,...,...
559995,2,Ryan was as good as everyone on yelp has claim...
559996,2,Professional \nFriendly\nOn time AND affordabl...
559997,1,Phone calls always go to voicemail and message...
559998,1,Looks like all of the good reviews have gone t...


In [18]:
# Lets create another column called 'truth'.
df['truth'] = ""

In [19]:
# Modify original sentiment values to be positive or negative instead of 1 or 2 for the truth category
# THIS WILL TAKE SOME TIME TO COMPLETE
for row in range(len(df)):
    
    if df['sentiment'].iloc[row] == 1:
        df['truth'].iloc[row] = 'negative'
    else:
        df['truth'].iloc[row] = 'positive'

# display it
df

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub data rate exceeded.
The Jupyter

Unnamed: 0,sentiment,text,truth
0,1,"Unfortunately, the frustration of being Dr. Go...",negative
1,2,Been going to Dr. Goldberg for over 10 years. ...,positive
2,1,I don't know what Dr. Goldberg was like before...,negative
3,1,I'm writing this review to give you a heads up...,negative
4,2,All the food is great here. But the best thing...,positive
...,...,...,...
559995,2,Ryan was as good as everyone on yelp has claim...,positive
559996,2,Professional \nFriendly\nOn time AND affordabl...,positive
559997,1,Phone calls always go to voicemail and message...,negative
559998,1,Looks like all of the good reviews have gone t...,negative


In [21]:
# Load the model
import xgboost as xgb

model = xgb.XGBClassifier(max_depth=10, n_estimators=1000, learning_rate=0.01)
model.load_model("model.json")

In [25]:
# Load our dataprocessing as we did in Session 3 for inference
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import pickle

# Download corpora (shit ton of text)
nltk.download('stopwords')
nltk.download('wordnet')

# English stop words here
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Load our vectorizer
loaded_vectorizer = pickle.load(open('vectorizer.pickle', 'rb'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/weston/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/weston/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [26]:
# our function to get predictions from our model
def predict(text):
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text, re.UNICODE)
    # convert to lowercase
    text = text.lower()
    # Lemmatize
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")] 
    # remove stop words
    text = [word for word in text if not word in stop_words] 
    # Bring the list back into a string
    text = " ".join(text)

    # Vectorize from our vectorizer created above
    data_features = loaded_vectorizer.transform([text])
    # Create an array as it expects
    data_features = data_features.toarray()

    # Get the prediciton 
    prediction = model.predict(data_features)[0]

    # 1 is positive 0 is negative
    if prediction == 1:
        sentiment = 'positive'
    else: 
        sentiment = 'negative'

    return sentiment

In [35]:
import pandas as pd
import dask.dataframe as dd

# Convert pandas DataFrame to Dask DataFrame
# Trying to speed up the processing time!
ddf = dd.from_pandas(df, npartitions=2)

In [36]:
# Get the predictions by running the function
ddf['predictions'] = ddf.map_partitions(lambda df: df['text'].apply(predict),meta=('text', 'object'))

In [37]:
# Compute the result.
# THIS WILL TAKE SOME TIME
result = ddf.compute()
print(result)

        sentiment                                               text  \
0               1  Unfortunately, the frustration of being Dr. Go...   
1               2  Been going to Dr. Goldberg for over 10 years. ...   
2               1  I don't know what Dr. Goldberg was like before...   
3               1  I'm writing this review to give you a heads up...   
4               2  All the food is great here. But the best thing...   
...           ...                                                ...   
559995          2  Ryan was as good as everyone on yelp has claim...   
559996          2  Professional \nFriendly\nOn time AND affordabl...   
559997          1  Phone calls always go to voicemail and message...   
559998          1  Looks like all of the good reviews have gone t...   
559999          2  Ryan Rocks! I called him this morning for some...   

           truth predictions  
0       negative    negative  
1       positive    positive  
2       negative    negative  
3       neg

In [44]:
result.to_csv("evaluation_results.csv")

In [None]:
# Restart the Kernel

In [8]:
# Import the evaluation output back into pandas
import pandas as pd

df = pd.read_csv("evaluation_results.csv", index_col=0)

In [9]:
df

Unnamed: 0,sentiment,text,truth,predictions
0,1,"Unfortunately, the frustration of being Dr. Go...",negative,negative
1,2,Been going to Dr. Goldberg for over 10 years. ...,positive,positive
2,1,I don't know what Dr. Goldberg was like before...,negative,negative
3,1,I'm writing this review to give you a heads up...,negative,negative
4,2,All the food is great here. But the best thing...,positive,positive
...,...,...,...,...
559995,2,Ryan was as good as everyone on yelp has claim...,positive,positive
559996,2,Professional \nFriendly\nOn time AND affordabl...,positive,positive
559997,1,Phone calls always go to voicemail and message...,negative,positive
559998,1,Looks like all of the good reviews have gone t...,negative,negative


In [3]:
# Comparing 'truth' and 'predictions' columns. If model got it correct then it is True
df['correct'] = df['truth'] == df['predictions']

In [5]:
# Out of all results, how many correct?
true_count = df['correct'].sum()

# Print the count
print("Number of matches:", true_count)

Number of matches: 402490


In [7]:
# Percentage correct
print("Percentage correct:", true_count / len(df))


Percentage correct: 0.7187321428571428


In [None]:
 #Conclusion

From here you would do further analysis to figure out where or why your model is not getting desired results or if you think its good enough.

You may need to retrain your model with different params, check your training data. Evaluations like confustion matrix is a good place to start for something like this. 