# Giovanni & Oscar | ML Final Project | Milestone 3

This notebook is a continuation of the Milestone 2 notebook (.ipynb).

## Points (10 pts)
## Tasks:

1. Add a section to your colab with at least 2 different methods for learning about the data and understanding the differences between the classes.
    > This can include visualization like histograms or tables/lists of features and their counts in each class, for example.
    > The goal of data analysis is to help you understand the task more deeply.

2. Everyone should submit a colab via Classroom even though we expect partners to submit identical copies.


### 0. Setup Dependencies and Milestone 2 Code

In [1]:
# Ensure that we have the newest version of pip installed
%pip install -q --upgrade pip

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m1.2/2.1 MB[0m [31m35.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.0/2.1 MB[0m [31m34.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Install necessary libraries
%pip install -q numpy
%pip install -q pandas
%pip install -q matplotlib
%pip install -q seaborn

# Helps avoid showing plots in a separate line
# %matplotlib inline

%pip install -q scikit-learn

[0m

In [3]:
# Import the modules
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import datasets

# Set the styling of the plt plots to darkgrid
sns.set(style='darkgrid')

# Suppresses warning messages and sets numpy's print precision to 3 decimal places
import warnings
warnings.filterwarnings('ignore')
np.set_printoptions(precision=3, suppress=True)

#### Load the Data in a Colab Notebook

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Please update the Path here to the location of your train.csv and test.csv files
# NOTE(review): this is a relative path, so it is resolved against the notebook's
# current working directory rather than the '/content/drive' mount point passed to
# drive.mount() — presumably the cause of the FileNotFoundError recorded for this
# cell; confirm and point it at the folder that actually holds the CSV files.
path_to_csv = 'Data/techexchange-2023-ml-project'

# Load the training and testing CSV files into Data Frames,
# then preview the first rows of the training data
train_df = pd.read_csv(f'{path_to_csv}/train.csv')
test_df = pd.read_csv(f'{path_to_csv}/test.csv')
display(train_df.head())

FileNotFoundError: ignored

In [None]:
# Split the training frame into model inputs (features) and outputs (labels)
input_names = ['id', 'keyword', 'location', 'text']
output_names = ['target']

input_df = train_df[input_names]
output_df = train_df[output_names]

# Preview the first rows of each frame: inputs first, then outputs
display(input_df.head())
display(output_df.head())

In [None]:
# Turn both frames into numpy arrays; the single label column is flattened to 1D
X_data = input_df.to_numpy()
Y_data = output_df.to_numpy().flatten()

# Preview the first five entries of each array
print(X_data[:5], Y_data[:5], sep='\n')

# Report the shape of each array
print(X_data.shape, Y_data.shape, sep='\n')

#### Split the training data into 90% training and 10% for validation (your experiments)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled data for validation (X_test / Y_test).
# A fixed random_state makes the shuffle reproducible, so the losses reported
# further down are comparable between runs and between copies of this notebook.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_data, Y_data, train_size=0.90, random_state=42
)

In [None]:
# Print the first two training examples together with their labels
for index in range(2):
    # Features of the example, followed by a blank line
    print('Text', X_train[index], '', sep='\n')

    # Raw label plus a human-readable Yes/No
    print('Is Natural Disaster?')
    print(Y_train[index], 'Yes' if Y_train[index] == 1 else 'No')

    # Separator line and a trailing blank line
    print('======================', end='\n\n')

#### Testing and Submitting a Baseline

In [None]:
# Baseline classifier: predicts the positive class for every input
def baseline_model(text_inputs):
    """Return the positive label (1) no matter what ``text_inputs`` contains."""
    positive_label = 1
    return positive_label

# Vectorized form: consumes one 1D feature vector per call and yields one scalar,
# so a 2D array of rows produces one prediction per row
baseline_model_np = np.vectorize(
    baseline_model,
    signature='(n) -> ()',
)

In [None]:
# Run the baseline over every row of the training split
baseline_predictions_train = baseline_model_np(X_train)

# Show the first five inputs next to their true label and the baseline's prediction
for i in range(5):
    print('Input:', X_train[i], '', sep='\n')
    print('Output', Y_train[i], '', sep='\n')
    print('Prediction', baseline_predictions_train[i], '==================', '', sep='\n')

In [None]:
# Calculates Log Loss
def calculate_loss(labels, predictions):
    """Return the mean binary cross-entropy (log loss) of ``predictions``.

    Args:
        labels: Array-like of true labels (0 or 1).
        predictions: Array-like of predicted probabilities for the positive
            class, with the same shape as ``labels``. Hard 0/1 predictions are
            accepted as well.

    Returns:
        The mean loss over all entries. It is never negative.
    """
    epsilon = 0.000001  # Keeps both log() arguments strictly inside (0, 1)

    # Accept plain lists as well as ndarrays, and work in floating point
    labels = np.asarray(labels, dtype=float)
    probabilities = np.asarray(predictions, dtype=float)

    # Clip instead of adding epsilon inside the log: adding it pushed the argument
    # above 1 for predictions of exactly 1 (or 0), which made the loss slightly negative
    probabilities = np.clip(probabilities, epsilon, 1 - epsilon)

    ce_values = -(labels * np.log(probabilities) + (1 - labels) * np.log(1 - probabilities))
    return ce_values.mean()

In [None]:
# Loss of the always-positive baseline on the training split
training_loss = calculate_loss(Y_train, baseline_predictions_train)
print('Training Loss:', training_loss)

# Run on the validation data: X_test / Y_test are the 10% held out of the
# training data by train_test_split, so the 'Testing Loss' printed here is a
# validation loss, not a score on test.csv
baseline_predictions_test = baseline_model_np(X_test)
testing_loss = calculate_loss(Y_test, baseline_predictions_test)
print('Testing Loss:', testing_loss)

In [None]:
# Create the submission CSV file for our Kaggle submission
def save_to_submissions_csv(text_inputs, prediction_labels, output_path='submission.csv'):
    """Write predictions to a two-column ``id,target`` CSV file.

    Args:
        text_inputs: 2D numpy array of model inputs; the first column must
            hold the id of each row.
        prediction_labels: 1D array-like with one predicted label per row of
            ``text_inputs``. Each label is written as an integer (``%d``).
        output_path: Where to write the CSV. Defaults to 'submission.csv' in
            the current working directory.

    Returns:
        None. The file at ``output_path`` is created or overwritten.
    """
    print(f'Generating "{output_path}" file...')

    # Extract the ids of the text inputs (the first column is already 1D)
    test_ids = text_inputs[:, 0]

    # Pair every id with its prediction and write them under an 'id,target' header;
    # comments='' stops numpy from prefixing the header line with '# '
    np.savetxt(
        output_path,
        np.rec.fromarrays([test_ids, prediction_labels]),
        fmt=['%s', '%d'],
        delimiter=',',
        header='id,target',
        comments='',
    )

    # Show success!
    print(f'Successfully created "{output_path}"')

In [None]:
# Select the input feature columns from the test dataframe
input_names = ['id', 'keyword', 'location', 'text']
test_input_df = test_df[input_names]

# Reformat the input dataframe into a numpy array for running through our model
test_input_np = test_input_df.to_numpy()

# Predict by using the baseline model on the test input and save to a .csv
# NOTE: this rebinds baseline_predictions_test, which previously held the
# baseline's predictions for the validation split (X_test)
baseline_predictions_test = baseline_model_np(test_input_np)
save_to_submissions_csv(test_input_np, baseline_predictions_test)

In [None]:
# Look at the first few predictions to ensure things went smoothly
pd.read_csv('submission.csv').head()

### 1. Visualize and Learn about the Data we are working with

In [None]:
from collections import Counter

def plot_frequency(tweets_np, labels_np, word_num=10):
    """Display the most frequent positive-class words with their per-class counts.

    Every non-missing feature of a row is converted to text and the row is
    split on whitespace. Words are lower-cased; anything starting with 'http'
    is counted as '<LINK>' and the articles 'the', 'a', 'an' are counted as
    '<ARTICLE>'. Words are tallied separately for rows whose label is truthy
    (positive) and falsy (negative).

    Args:
        tweets_np: 2D array-like with one row per tweet and one scalar value
            per feature column.
        labels_np: 1D array-like of labels aligned with the rows of
            ``tweets_np``.
        word_num: How many of the most common positive-class words to show.

    Returns:
        None. A table with the columns 'word', 'pos count' and 'neg count' is
        shown with IPython's ``display`` (or ``print`` when IPython is not
        installed).
    """
    articles = {'the', 'a', 'an'}

    # Counters keep track of the frequency of words (missing words count as 0)
    pos_counter = Counter()
    neg_counter = Counter()

    # Go through the tweets dataset
    for features, label in zip(tweets_np, labels_np):
        counter = pos_counter if label else neg_counter

        # Flatten the features into a single string; missing values (NaN/None)
        # are skipped so they are not counted as the literal word 'nan'/'none'
        text = ' '.join(str(feature) for feature in features if not pd.isna(feature))

        # Count the frequency of each word
        for word in text.split():
            word = word.lower()
            # Links would count as independent words without this
            if word.startswith('http'):
                word = '<LINK>'
            # Articles would count as independent words without this
            elif word in articles:
                word = '<ARTICLE>'
            counter[word] += 1

    # Place the data into a data frame, sorted by positive-class frequency
    top_words = pos_counter.most_common(word_num)
    result = {
        'word': [word for word, _ in top_words],
        'pos count': [count for _, count in top_words],
        'neg count': [neg_counter[word] for word, _ in top_words],
    }
    word_count_df = pd.DataFrame(data=result, columns=['word', 'pos count', 'neg count'])

    # Use the notebook's rich display when available, plain text otherwise
    try:
        from IPython.display import display as show_table
    except ImportError:
        show_table = print
    show_table(word_count_df)


In [None]:
plot_frequency(X_data, Y_data)