# Track 1: Filtering the Noise: ML for Trustworthy Location Reviews
## Team ureca: Lee Hyunseung, Park Yumin, Yoon Hyukjin

## 1. ML Model Training

#### 0. Import Libraries and Dependencies

In [1]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE

import nltk
# Fetch the NLTK stopword corpus. As this cell's output shows, a failed fetch
# (here an SSL certificate error) is reported through a False return value
# rather than an exception, so a False result means the corpus was NOT fetched.
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


False

#### 1. Load Final Processed Data with Features and Ground Truth Labels (GPT generated)

In [3]:
# Load the pre-computed training table. Per the rendered output it holds five
# feature scores (A, B, D2, E, G) plus the `policy_label` target.
df_train = pd.read_csv("final_data_featured_filtered.csv")
df_train  # bare last expression -> rendered as the cell output

Unnamed: 0,A,B,D2,E,G,policy_label
0,0.151367,0.581653,0.660104,0.993421,0.999407,1.0
1,0.014161,0.565305,0.620909,0.360918,0.997275,1.0
2,0.009880,0.575038,0.551840,0.987322,0.999049,1.0
3,0.143038,0.575837,0.630236,0.859828,0.999422,1.0
4,0.384205,0.572793,0.792418,0.901759,0.999557,1.0
...,...,...,...,...,...,...
31115,0.007734,0.549692,0.500000,0.773126,0.999582,1.0
31116,0.078986,0.553054,0.576117,0.942918,0.999487,1.0
31117,0.009356,0.544649,0.500000,0.979812,0.957214,1.0
31118,0.012299,0.536573,0.500000,0.796173,0.999015,1.0


#### 2. Feature and Target Declaration

In [4]:
# Separate the model inputs from the target. The column list is named so the
# feature order fed to the network is stated in exactly one place in this cell.
feature_columns = ["A", "B", "D2", "E", "G"]
X = df_train[feature_columns]
y = df_train["policy_label"]

In [5]:
# Class-balance check: non-null row count of every column per policy_label
# value (the output shows label 1.0 heavily outnumbering label 0.0).
df_train.groupby("policy_label").count()

Unnamed: 0_level_0,A,B,D2,E,G
policy_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,3783,3783,3783,3783,3783
1.0,27337,27337,27337,27337,27337


#### 3. Use SMOTE to oversample the imbalanced training data

In [6]:
# Oversample with SMOTE so that training sees balanced classes; random_state is
# fixed so the synthetic rows are reproducible.
# NOTE(review): the resampling is applied to the full table -- no hold-out
# split is taken beforehand in the cells visible here.
smote = SMOTE(sampling_strategy='auto', random_state=42) 
X_res, y_res = smote.fit_resample(X, y)

#### 4. Define Neural Network: Multi-layer Perceptron

In [7]:
# Seed the Python, NumPy and TensorFlow RNGs before any layer is created so
# that weight initialisation, dropout masks and batch shuffling -- and hence
# the trained model and its predictions -- are reproducible across
# Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Multi-layer perceptron for binary classification over the five policy
# feature scores (A, B, D2, E, G).
model = Sequential([
    # One input per feature score.
    Input(shape=(5,)),

    # Wide stage: linear projection -> batch norm -> ReLU layer -> dropout.
    Dense(64),
    BatchNormalization(),
    Dense(64, activation='relu'),
    Dropout(0.3),

    # Narrow stage: same pattern at half the width, with lighter dropout.
    Dense(32),
    BatchNormalization(),
    Dense(32, activation='relu'),
    Dropout(0.2),

    # Single sigmoid unit: one probability per review, shape (n, 1).
    Dense(1, activation='sigmoid')
])

#### 5. Compile Model using the corresponding parameters

In [8]:
# Training configuration: Adam with a small step size, binary cross-entropy
# for the single sigmoid output, and accuracy / precision / recall reported
# every epoch.
adam_optimizer = Adam(learning_rate=1e-5)
tracked_metrics = [
    'accuracy',
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
]
model.compile(
    optimizer=adam_optimizer,
    loss='binary_crossentropy',
    metrics=tracked_metrics,
)

#### 6. Train the model

In [9]:
# Fit on the SMOTE-balanced data; `history` keeps the per-epoch metric log.
# NOTE(review): no validation data is passed, so the reported metrics are
# training-set metrics only.
training_options = {"epochs": 100, "batch_size": 64, "verbose": 1}
history = model.fit(X_res, y_res, **training_options)

Epoch 1/100
[1m855/855[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 525us/step - accuracy: 0.5490 - loss: 0.7063 - precision: 0.5424 - recall: 0.6264
Epoch 2/100
[1m855/855[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 539us/step - accuracy: 0.5906 - loss: 0.6775 - precision: 0.5725 - recall: 0.7149
Epoch 3/100
[1m855/855[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 563us/step - accuracy: 0.6064 - loss: 0.6629 - precision: 0.5845 - recall: 0.7364
Epoch 4/100
[1m855/855[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 527us/step - accuracy: 0.6230 - loss: 0.6493 - precision: 0.5993 - recall: 0.7424
Epoch 5/100
[1m855/855[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 506us/step - accuracy: 0.6298 - loss: 0.6428 - precision: 0.6074 - recall: 0.7340
Epoch 6/100
[1m855/855[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 506us/step - accuracy: 0.6389 - loss: 0.6366 - precision: 0.6189 - recall: 0.7226
Epoch 7/100
[1m855/855[0m [32m━━━━━━━

## 2. Demonstration of Model Use Case

#### 1. Review and Metadata Preprocessing

In [10]:
from data_gpt_labeler.data_preprocessing_module import (
    preprocess_review_metadata, 
    preprocess_policy_C,
    preprocess_policy_D1,
    preprocess_policy_F
)

In [13]:
# Directory holding the raw review dumps (JSON Lines files). It defaults to the
# location used when the notebook was authored; set the REVIEW_DATA_DIR
# environment variable to run the notebook on another machine without editing
# this cell.
data_dir = os.environ.get("REVIEW_DATA_DIR", "/Users/yumin/Downloads")

# Business metadata and review records, one JSON object per line.
df_meta = pd.read_json(os.path.join(data_dir, "meta-South_Dakota.json"), lines=True)
df_review = pd.read_json(os.path.join(data_dir, "review-South_Dakota_10.json"), lines=True)

In [15]:
# Combine review rows with business metadata through the project helper. Per
# the rendered output the result has rating / text / business_* / _id columns
# and a `result` column that is still empty at this stage.
df_preprocessed = preprocess_review_metadata(df_meta=df_meta, df_review=df_review)
df_preprocessed  # bare last expression -> rendered as the cell output

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged_final["result"] = None


Unnamed: 0,rating,text,business_name,business_category,business_description,_id,result
0,1,Ordered a venti blonde salted caramel mocha a...,Starbucks,"[Coffee shop, Cafe, Coffee store, Espresso bar]",Seattle-based coffeehouse chain known for its ...,1.0507798098587045e+20_1598647132077,
1,3,"My experiences were okay, the drinks were good...",Starbucks,"[Coffee shop, Cafe, Coffee store, Espresso bar]",Seattle-based coffeehouse chain known for its ...,1.1267481698306823e+20_1607019739498,
2,5,Yummy,Starbucks,"[Coffee shop, Cafe, Coffee store, Espresso bar]",Seattle-based coffeehouse chain known for its ...,1.135714341354508e+20_1617357167910,
3,5,"Andrea is a pro! The computer was down, she wa...",Starbucks,"[Coffee shop, Cafe, Coffee store, Espresso bar]",Seattle-based coffeehouse chain known for its ...,1.0849846491577998e+20_1520781652933,
4,5,Consistantly good,Starbucks,"[Coffee shop, Cafe, Coffee store, Espresso bar]",Seattle-based coffeehouse chain known for its ...,1.1503982235122152e+20_1579458022307,
...,...,...,...,...,...,...,...
186527,5,Fun,Rushmore Tramway Adventures,"[Tourist attraction, Adventure sports]","Adventure park with chairlift, zip-line & tree...",1.1582266652005546e+20_1561943281610,
186528,5,Super fun,Rushmore Tramway Adventures,"[Tourist attraction, Adventure sports]","Adventure park with chairlift, zip-line & tree...",1.1257190083569926e+20_1540065452705,
186529,5,Awesome,Rushmore Tramway Adventures,"[Tourist attraction, Adventure sports]","Adventure park with chairlift, zip-line & tree...",1.1057734584587012e+20_1531431690487,
186530,5,Awesome,Rushmore Tramway Adventures,"[Tourist attraction, Adventure sports]","Adventure park with chairlift, zip-line & tree...",1.1081098441501137e+20_1528673082548,


#### 2. Rule-Based Filtering (Policy C, D1, F)

In [16]:
# Apply the rule-based Policy C filter. In the displayed sample, very short
# reviews ("Yummy", "Fun", ...) come back with result 0 while longer ones stay
# empty -- the exact rule lives in the project module; confirm there.
df_preprocessed_C = preprocess_policy_C(df_preprocessed)
df_preprocessed_C  # bare last expression -> rendered as the cell output

Unnamed: 0,rating,text,business_name,business_category,business_description,_id,result
0,1,Ordered a venti blonde salted caramel mocha a...,Starbucks,"[Coffee shop, Cafe, Coffee store, Espresso bar]",Seattle-based coffeehouse chain known for its ...,1.0507798098587045e+20_1598647132077,
1,3,"My experiences were okay, the drinks were good...",Starbucks,"[Coffee shop, Cafe, Coffee store, Espresso bar]",Seattle-based coffeehouse chain known for its ...,1.1267481698306823e+20_1607019739498,
2,5,Yummy,Starbucks,"[Coffee shop, Cafe, Coffee store, Espresso bar]",Seattle-based coffeehouse chain known for its ...,1.135714341354508e+20_1617357167910,0
3,5,"Andrea is a pro! The computer was down, she wa...",Starbucks,"[Coffee shop, Cafe, Coffee store, Espresso bar]",Seattle-based coffeehouse chain known for its ...,1.0849846491577998e+20_1520781652933,
4,5,Consistantly good,Starbucks,"[Coffee shop, Cafe, Coffee store, Espresso bar]",Seattle-based coffeehouse chain known for its ...,1.1503982235122152e+20_1579458022307,0
...,...,...,...,...,...,...,...
186527,5,Fun,Rushmore Tramway Adventures,"[Tourist attraction, Adventure sports]","Adventure park with chairlift, zip-line & tree...",1.1582266652005546e+20_1561943281610,0
186528,5,Super fun,Rushmore Tramway Adventures,"[Tourist attraction, Adventure sports]","Adventure park with chairlift, zip-line & tree...",1.1257190083569926e+20_1540065452705,0
186529,5,Awesome,Rushmore Tramway Adventures,"[Tourist attraction, Adventure sports]","Adventure park with chairlift, zip-line & tree...",1.1057734584587012e+20_1531431690487,0
186530,5,Awesome,Rushmore Tramway Adventures,"[Tourist attraction, Adventure sports]","Adventure park with chairlift, zip-line & tree...",1.1081098441501137e+20_1528673082548,0


In [17]:
# Apply the rule-based Policy D1 filter on top of the Policy C output; in the
# displayed sample additional rows now carry result 0.
df_preprocessed_C_D1 = preprocess_policy_D1(df_preprocessed_C)
df_preprocessed_C_D1  # bare last expression -> rendered as the cell output

Unnamed: 0,rating,text,business_name,business_category,business_description,_id,result
0,1,Ordered a venti blonde salted caramel mocha a...,Starbucks,"[Coffee shop, Cafe, Coffee store, Espresso bar]",Seattle-based coffeehouse chain known for its ...,1.0507798098587045e+20_1598647132077,0
1,3,"My experiences were okay, the drinks were good...",Starbucks,"[Coffee shop, Cafe, Coffee store, Espresso bar]",Seattle-based coffeehouse chain known for its ...,1.1267481698306823e+20_1607019739498,0
2,5,Yummy,Starbucks,"[Coffee shop, Cafe, Coffee store, Espresso bar]",Seattle-based coffeehouse chain known for its ...,1.135714341354508e+20_1617357167910,0
3,5,"Andrea is a pro! The computer was down, she wa...",Starbucks,"[Coffee shop, Cafe, Coffee store, Espresso bar]",Seattle-based coffeehouse chain known for its ...,1.0849846491577998e+20_1520781652933,0
4,5,Consistantly good,Starbucks,"[Coffee shop, Cafe, Coffee store, Espresso bar]",Seattle-based coffeehouse chain known for its ...,1.1503982235122152e+20_1579458022307,0
...,...,...,...,...,...,...,...
186527,5,Fun,Rushmore Tramway Adventures,"[Tourist attraction, Adventure sports]","Adventure park with chairlift, zip-line & tree...",1.1582266652005546e+20_1561943281610,0
186528,5,Super fun,Rushmore Tramway Adventures,"[Tourist attraction, Adventure sports]","Adventure park with chairlift, zip-line & tree...",1.1257190083569926e+20_1540065452705,0
186529,5,Awesome,Rushmore Tramway Adventures,"[Tourist attraction, Adventure sports]","Adventure park with chairlift, zip-line & tree...",1.1057734584587012e+20_1531431690487,0
186530,5,Awesome,Rushmore Tramway Adventures,"[Tourist attraction, Adventure sports]","Adventure park with chairlift, zip-line & tree...",1.1081098441501137e+20_1528673082548,0


In [18]:
# Apply the rule-based Policy F filter last. The warning in the cell output
# shows it matches the review text against a combined regex pattern.
df_preprocessed_C_D1_F = preprocess_policy_F(df_preprocessed_C_D1)
df_preprocessed_C_D1_F  # bare last expression -> rendered as the cell output

  condition = (df['text'].str.contains(combined_pattern, regex=True))


Unnamed: 0,rating,text,business_name,business_category,business_description,_id,result
0,1,Ordered a venti blonde salted caramel mocha a...,Starbucks,"[Coffee shop, Cafe, Coffee store, Espresso bar]",Seattle-based coffeehouse chain known for its ...,1.0507798098587045e+20_1598647132077,0
1,3,"My experiences were okay, the drinks were good...",Starbucks,"[Coffee shop, Cafe, Coffee store, Espresso bar]",Seattle-based coffeehouse chain known for its ...,1.1267481698306823e+20_1607019739498,0
2,5,Yummy,Starbucks,"[Coffee shop, Cafe, Coffee store, Espresso bar]",Seattle-based coffeehouse chain known for its ...,1.135714341354508e+20_1617357167910,0
3,5,"Andrea is a pro! The computer was down, she wa...",Starbucks,"[Coffee shop, Cafe, Coffee store, Espresso bar]",Seattle-based coffeehouse chain known for its ...,1.0849846491577998e+20_1520781652933,0
4,5,Consistantly good,Starbucks,"[Coffee shop, Cafe, Coffee store, Espresso bar]",Seattle-based coffeehouse chain known for its ...,1.1503982235122152e+20_1579458022307,0
...,...,...,...,...,...,...,...
186527,5,Fun,Rushmore Tramway Adventures,"[Tourist attraction, Adventure sports]","Adventure park with chairlift, zip-line & tree...",1.1582266652005546e+20_1561943281610,0
186528,5,Super fun,Rushmore Tramway Adventures,"[Tourist attraction, Adventure sports]","Adventure park with chairlift, zip-line & tree...",1.1257190083569926e+20_1540065452705,0
186529,5,Awesome,Rushmore Tramway Adventures,"[Tourist attraction, Adventure sports]","Adventure park with chairlift, zip-line & tree...",1.1057734584587012e+20_1531431690487,0
186530,5,Awesome,Rushmore Tramway Adventures,"[Tourist attraction, Adventure sports]","Adventure park with chairlift, zip-line & tree...",1.1081098441501137e+20_1528673082548,0


#### 3. Sample subset data for demo purpose

In [19]:
# Build a 10-row demo set: 2 reviews the rule-based filters already marked
# with result 0, plus 8 reviews that are still undecided (result is NaN) and
# will be scored by the model. Fixed seeds keep the sample stable.
already_flagged = df_preprocessed_C_D1_F['result'] == 0
still_undecided = df_preprocessed_C_D1_F['result'].isna()

sample_0 = df_preprocessed_C_D1_F.loc[already_flagged].sample(2, random_state=40)
sample_none = df_preprocessed_C_D1_F.loc[still_undecided].sample(8, random_state=40)

# Rule-flagged rows first, undecided rows after, on a fresh 0..9 index.
df_test = pd.concat([sample_0, sample_none]).reset_index(drop=True)
df_test

Unnamed: 0,rating,text,business_name,business_category,business_description,_id,result
0,5,Excellent accommodations.,Best Western Ramkota Hotel,[Hotel],"Casual hotel with free Wi-Fi & parking, plus a...",1.1083274779108793e+20_1483974272359,0.0
1,5,Incredible,Badlands National Park,"[National park, Tourist attraction]","244,000 rugged acres of geological formations,...",1.0131807086795317e+20_1525662824665,0.0
2,5,Outstanding. I get excited every time someone ...,JL Beers,"[Bar, Bar & grill, Hamburger restaurant]",Relaxed hangout featuring burgers with creativ...,1.1376567741363639e+20_1484332943453,
3,4,It was so peaceful and pretty!,Palisades State Park,"[State park, Tourist attraction]","A park with quartzite formations, pipestone qu...",1.1372999123282656e+20_1532908672384,
4,2,Food quality average and prices too expensive.,Spezia Restaurant,[Italian restaurant],Relaxed Italian specialist offering charbroile...,1.1481787428915382e+20_1549690002630,
5,4,Homemade pizza that you can watch them make th...,Pizza Works,"[Pizza restaurant, Italian restaurant]","Pizza, pasta & salads served in a charming, in...",1.1320780188754972e+20_1562807452370,
6,5,Love everything about this place! From the uni...,Remedy Brewing Company,"[Brewpub, Brewery]",Spacious warehouse setting for house beers & f...,1.1663413847587925e+20_1616974614188,
7,4,(Translated by Google) Very good place\n\n(Ori...,Super 8 by Wyndham Sioux Falls Near Convention...,"[Hotel, Inn, Lodge]",Contemporary budget property offering a busine...,1.151275183865607e+20_1530207005919,
8,4,Always has great prices,Once Upon A Child,"[Children's clothing store, Clothing store, De...",Chain operation that buys & sells gently used ...,1.1268750627621305e+20_1512763026073,
9,5,Good old standby for what ever we need...lunch...,SF 41st Fryn' Pan Family Restaurant,"[Restaurant, Family restaurant]",Local chain serving breakfast & homestyle Amer...,1.1100517661351048e+20_1565527187474,


#### 4. Model-Based Feature Engineering (Policy A, B, D2, E, G)

##### Policy A

In [12]:
# Training frame handed to the Policy A feature generator in the next cell.
# NOTE(review): this rebinds `df_train`, which earlier held the feature table
# that X / y were built from; a distinct name would make re-runs less
# confusing. The execution counts of this section are also out of order.
df_train = pd.read_csv("final_lda_train_data.csv")

In [16]:
# Policy A feature. Per the cell output, the helper trains an LDA model on
# `df_train` and then calculates similarity scores for the rows of `df_test`.
from feature_engineering_model.modules.policy_A_module import policy_A_feature_generation_v3
feature_A = policy_A_feature_generation_v3(df_train, df_test)

Training LDA model on the provided training DataFrame...
LDA model training complete.
Calculating similarity scores for df_test...


##### Policy B

In [20]:
# Policy B feature for the demo rows (a "specificity" score, going by the
# helper's name). The cell output shows the helper attempting several NLTK
# downloads that failed here with SSL errors.
# NOTE(review): n_workers=4 is presumably a parallelism setting -- confirm in
# the module.
from feature_engineering_model.modules.policy_B_module import calculate_specificity_score
feature_B = calculate_specificity_score(df_test, n_workers=4)

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>
[nltk_data] Error loading punkt_tab: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>
[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify
[nltk_data]     failed: unable to get local issuer certificate
[nltk_data]     (_ssl.c:1000)>
[nltk_data] Error loading maxent_ne_chunker: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>
[nltk_data] Error loading words: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer cert

##### Policy D2

In [26]:
# Policy D2 feature for the demo rows (an "interpretability" score, going by
# the helper's name).
# NOTE(review): n_workers=4 is presumably a parallelism setting -- confirm in
# the module.
from feature_engineering_model.modules.policy_D2_module import calculate_interpretability_scores_for_df
feature_D2 = calculate_interpretability_scores_for_df(df_test, n_workers=4)

##### Policy E

In [27]:
# Policy E feature for the demo rows (a "consistency" score, going by the
# helper's name). The cell output shows it loading the
# cardiffnlp/twitter-roberta-base-sentiment-latest checkpoint.
# NOTE(review): max_workers=4 is presumably a parallelism setting -- confirm
# in the module.
from feature_engineering_model.modules.policy_E_module import compute_consistency_scores
feature_E = compute_consistency_scores(df_test, max_workers=4)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassifi

##### Policy G

In [28]:
# Policy G feature for the demo rows, computed by the project helper.
# NOTE(review): what the score measures is not visible from this notebook --
# see the policy_G module.
from feature_engineering_model.modules.policy_G_module import compute_policy_g_series_processed
feature_G = compute_policy_g_series_processed(df_test)

#### 5. Combining Extracted Features for input into Model

In [None]:
# Assemble the model input: place the five per-policy feature outputs side by
# side, relabel the integer column labels 0..4 with the policy letters, and
# drop the leftover "Unnamed: 0" column.
policy_features = [feature_A, feature_B, feature_D2, feature_E, feature_G]
column_labels = {0: "A", 1: "B", 2: "D2", 3: "E", 4: "G"}

df_10 = (
    pd.concat(policy_features, axis=1)
    .rename(columns=column_labels)
    .drop("Unnamed: 0", axis=1)
)
print(df_10)

          A         B        D2         E         G
0  0.032823  0.534087  0.000000  0.973373  0.999569
1  0.023957  0.510122  0.100000  0.901971  0.999585
2  0.017299  0.555200  0.595120  0.991566  0.999293
3  0.018025  0.537722  0.500000  0.761343  0.999658
4  0.017456  0.561826  0.500000  0.796382  0.999474
5  0.389479  0.567430  0.640200  0.758484  0.999005
6  0.144338  0.595389  0.582471  0.992629  0.999540
7  0.126351  0.567221  0.500000  0.806626  0.999491
8  0.122423  0.555855  0.500000  0.785138  0.999578
9  0.183515  0.564866  0.500000  0.847798  0.999045


#### 6. Predict on the input features and write the results back into the DataFrame

In [30]:
# Only rows the rule-based filters left undecided (result is NaN) are scored
# by the model; rows that already carry a rule-based result are left untouched.
mask = df_test["result"].isna()

# Nothing to score if the rules decided every row -- skip predict() rather
# than calling it on an empty frame.
if mask.any():
    # Pass the features in exactly the order the network was trained on
    # (A, B, D2, E, G); the model consumes columns positionally.
    result_probs = model.predict(df_10.loc[mask, ["A", "B", "D2", "E", "G"]])

    # The sigmoid head yields shape (n, 1). Threshold at 0.5 and flatten to
    # (n,) so the values map one-to-one onto the selected rows of the single
    # `result` column instead of relying on implicit 2-D broadcasting.
    preds = (result_probs > 0.5).astype(int).ravel()
    df_test.loc[mask, "result"] = preds

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step


#### 7. Results

In [31]:
# Final demo table: `result` now holds either the rule-based decision or the
# model's thresholded prediction for every sampled review.
df_test

Unnamed: 0,rating,text,business_name,business_category,business_description,_id,result
0,5,Excellent accommodations.,Best Western Ramkota Hotel,[Hotel],"Casual hotel with free Wi-Fi & parking, plus a...",1.1083274779108793e+20_1483974272359,0
1,5,Incredible,Badlands National Park,"[National park, Tourist attraction]","244,000 rugged acres of geological formations,...",1.0131807086795317e+20_1525662824665,0
2,5,Outstanding. I get excited every time someone ...,JL Beers,"[Bar, Bar & grill, Hamburger restaurant]",Relaxed hangout featuring burgers with creativ...,1.1376567741363639e+20_1484332943453,1
3,4,It was so peaceful and pretty!,Palisades State Park,"[State park, Tourist attraction]","A park with quartzite formations, pipestone qu...",1.1372999123282656e+20_1532908672384,0
4,2,Food quality average and prices too expensive.,Spezia Restaurant,[Italian restaurant],Relaxed Italian specialist offering charbroile...,1.1481787428915382e+20_1549690002630,0
5,4,Homemade pizza that you can watch them make th...,Pizza Works,"[Pizza restaurant, Italian restaurant]","Pizza, pasta & salads served in a charming, in...",1.1320780188754972e+20_1562807452370,1
6,5,Love everything about this place! From the uni...,Remedy Brewing Company,"[Brewpub, Brewery]",Spacious warehouse setting for house beers & f...,1.1663413847587925e+20_1616974614188,1
7,4,(Translated by Google) Very good place\n\n(Ori...,Super 8 by Wyndham Sioux Falls Near Convention...,"[Hotel, Inn, Lodge]",Contemporary budget property offering a busine...,1.151275183865607e+20_1530207005919,1
8,4,Always has great prices,Once Upon A Child,"[Children's clothing store, Clothing store, De...",Chain operation that buys & sells gently used ...,1.1268750627621305e+20_1512763026073,0
9,5,Good old standby for what ever we need...lunch...,SF 41st Fryn' Pan Family Restaurant,"[Restaurant, Family restaurant]",Local chain serving breakfast & homestyle Amer...,1.1100517661351048e+20_1565527187474,1
