In [2]:
import pandas as pd
import numpy as np
import re
import plotly.express as px

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix

# Model 1:

## Can we determine if a particular tweet is relevant to China or not ? 

### Data Preparation

Firstly, we need to remove useless columns (ids) and drop any rows who have a NaN value in the Bucket column since that is our target variable. We also can convert Bucket to a boolean column for ease of understanding by making a new column marking if a row is in Bucket 1 or not. 

In [8]:
df = pd.read_csv("../Data/sampleData.csv")
df = df[df["country"] == "China"]

df[df["SentimentScore"] > 3.0]['text'][15418]
# df.dropna(subset = ["Bucket"])
# df["Relevant"] = df["Bucket"].apply(lambda x: x == "1")
# df = df[["text", "Relevant"]]
# df.head()

'The CCP should take note. Asia is fed up with territorial breaches in the South China Sea. Canada &amp; Australia suspended extradition treaties with Hong Kong. The U.K. canceled Huawei over Chinese spying. @POTUS &amp; @SecPompeo continue to hold Beijing accountable.'

In [3]:
def text_cleaning(text):
    
    text = text.lower()
    text = re.sub('[^A-Za-z0-9\s]', '', text)
    return text

df['text'] = df['text'].apply(text_cleaning)

### Modeling

We can also determine our x (feature) and y (target) variables early on as well. 

In [4]:
x = df['text']
y = df['Relevant']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42, shuffle = True)

In [5]:
dt_classifier = Pipeline([('tfidf', TfidfVectorizer()), ('clf', DecisionTreeClassifier())])
svm_classifier = Pipeline([('tfidf', TfidfVectorizer()), ('svm', SVC())])

dt_classifier.fit(x_train, y_train)
svm_classifier.fit(x_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('svm', SVC())])

In [6]:
# dt 
preds_dt = dt_classifier.predict(x_test)
acc_dt = (preds_dt == y_test).mean()

# svc 
preds_svm = svm_classifier.predict(x_test)
acc_svm = (preds_svm == y_test).mean()

[acc_dt, acc_svm]

[0.692429022082019, 0.7867507886435331]

In [7]:
confusion_matrix(preds_dt, y_test)

array([[ 491,  492],
       [ 483, 1704]])

In [8]:
confusion_matrix(preds_svm, y_test)

array([[ 455,  157],
       [ 519, 2039]])