Necessary Imports

In [139]:
import pandas as pd
import re
from sklearn import svm
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# model = Perceptron()
# model = svm.SVC()
# model = KNeighborsClassifier(n_neighbors=4)
model = GaussianNB()

Read in Data, drop certain cells from database and turn all fields into numbers

In [140]:
# Read in Data
db = pd.read_csv("countries.csv", encoding='latin-1')

# Columns we do not want to consider
to_drop = ["CountryID", "Country Name", "WEBNAME", "World Rank", "Region Rank", "2019 Score", "Country", "FDI Inflow (Millions)"]
db = db.drop(to_drop, axis=1)

# Drop rows with na columns
db = db.dropna()

# Turn Regions into integers
REGIONS = ["Asia-Pacific", "Europe", "Middle East and North Africa", "Sub-Saharan Africa", "Americas"]
db.Region = db.Region.apply(REGIONS.index)

# Turn everything into floats
def floatify(st):
    try:
        return float(st)
    except:
        st = st[1:].replace(",", "")
        st = re.sub(r'(\d+\.\d*?)\D.*', "\\1", st)
        return float(st)
    
for col in db.columns:
    db[col] = db[col].apply(floatify)
    
# Add a column for unemployment quartile (1-4)
quartiles = []
for value in db["Unemployment (%)"]:
    if value <= 3.7:
        quartiles.append(1)
    elif value <= 5.55:
        quartiles.append(2)
    elif value <= 9.3:
        quartiles.append(3)
    else:
        quartiles.append(4)
        
db["Unemployment Quartile"] = quartiles

# Drop unemployment:
db = db.drop(["Unemployment (%)"], axis=1)

Turn dataframe into lists of evidence and labels

In [141]:
evidence, labels = db.drop(columns = ['Unemployment Quartile']), db[['Unemployment Quartile']]
evindence = evidence.values.tolist()
labels = [ls[0] for ls in labels.values.tolist()]

Split into training/testing and train the model

In [142]:
X_training, X_testing, y_training, y_testing = train_test_split(
    evidence, labels, test_size=0.2
)

# Fir the model:
model.fit(X_training, y_training)

GaussianNB(priors=None)

Predicting and testing

In [143]:
# Make predictions on the testing set
predictions = model.predict(X_testing)

# Compute how well we performed
correct = (y_testing == predictions).sum()
incorrect = (y_testing != predictions).sum()
total = len(predictions)

# Print results
print(f"Results for model {type(model).__name__}")
print(f"Correct: {correct}")
print(f"Incorrect: {incorrect}")
print(f"Accuracy: {100 * correct / total:.2f}%")

Results for model GaussianNB
Correct: 15
Incorrect: 20
Accuracy: 42.86%
