In [None]:
from algorithms import *
from keras.layers import Dense
from keras.models import Sequential
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import numpy as np
import os
import pandas as pd
import pickle
import time
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE

In [None]:
# Read the development and evaluation CSVs into two DataFrames.
df, df_eval = map(pd.read_csv, ("dsl_data/development.csv", "dsl_data/evaluation.csv"))

In [None]:
# Columns handed to the project's label_encoder helper.
cols_to_be_encoded = ['gender', 'ageRange']

# For every column, process the development frame first and the evaluation frame second.
# The helper's return value is not used, so it presumably encodes in place — TODO confirm.
for column in cols_to_be_encoded:
    for frame in (df, df_eval):
        label_encoder(frame, column)

In [None]:
# Call the project helper with the development and evaluation frames. Its return value is
# discarded here, so it presumably rebalances `df` in place — TODO confirm in `algorithms`.
balance_trainset_based_on_test(df,df_eval)

In [None]:
# Columns removed from both frames before feature extraction.
cols = [
    'Id',
    'Self-reported fluency level ',
    'First Language spoken',
    'Current language used for work/school',
]

# Drop the same four columns from each frame, mutating the frames in place.
for frame in (df, df_eval):
    frame.drop(columns=cols, inplace=True)

In [None]:
# Feature extraction is skipped when a pickled copy of the frames is already on disk.
# Only the presence of df.pkl is checked; df_eval.pkl is assumed to sit next to it.
feature_cache_present = os.path.isfile("df.pkl")

if not feature_cache_present:
    df = extract_all_features(df)
    df_eval = extract_all_features(df_eval)
else:
    # NOTE: pickle.load can execute arbitrary code; only load caches written by this notebook.
    with open('df.pkl', 'rb') as f:
        df = pickle.load(f)
    with open('df_eval.pkl', 'rb') as f:
        df_eval = pickle.load(f)

In [None]:
# The label is the 'action' column concatenated with the 'object' column,
# mapped to integer codes by a LabelEncoder that is kept for decoding later.
encoder = LabelEncoder()
target_class = df['action'] + df['object']
y = encoder.fit_transform(target_class)

In [None]:
# Everything except the path, the speaker id and the two label columns is a model input.
non_feature_columns = ['path', 'speakerId', 'action', 'object']
X = df.drop(columns=non_feature_columns)

In [None]:
# Standardise the development features; the fitted scaler is kept for later reuse.
scaler = StandardScaler()
X_dev_matrix = np.array(X, dtype=float)
X = scaler.fit(X_dev_matrix).transform(X_dev_matrix)

In [None]:
# The evaluation frame drops only the path and speaker id columns here.
eval_non_feature_columns = ['path', 'speakerId']
X_eval = df_eval.drop(columns=eval_non_feature_columns)

In [None]:
# Reuse the statistics learned on the development set. Re-fitting the scaler here would
# standardise the evaluation features with a different mean/variance than the one the
# models are trained on, and would also overwrite the fitted development statistics.
X_eval = scaler.transform(np.array(X_eval, dtype = float))

In [None]:
# Hold out 20% of the development rows; the fixed seed makes the shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
# Recursive feature elimination ranks features through the wrapped estimator's `coef_`
# (or `feature_importances_`). SVC only exposes `coef_` with a linear kernel, so the
# estimator handed to RFE must stay linear.
svm = SVC(kernel="linear")

# Initialize RFE
rfe = RFE(svm)

# Grid over the number of features to keep and the SVC regularisation strength.
# The 'rbf' and 'poly' kernels are deliberately not searched: every such candidate fails to
# fit inside RFE (no `coef_`). `gamma` is not searched either, because the linear kernel
# ignores it and each extra value would only repeat an identical fit.
param_grid = {'n_features_to_select': [10, 20, 30],
              'estimator__C': [0.1, 1, 10]}

# Define the scoring function
acc_scorer = make_scorer(accuracy_score)

# Initialize GridSearchCV
grid_search = GridSearchCV(rfe, param_grid, scoring=acc_scorer)

# Fit the grid_search to the data
grid_search.fit(X, y)

# Print the best parameters and the best score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

# Print the boolean mask of the features kept by the best RFE
print(grid_search.best_estimator_.support_)

In [None]:
def svm_model(X_train, y_train, X_test, C=10, gamma=0.1, kernel='rbf'):
    """Fit an SVC on the training data and predict labels for ``X_test``.

    Parameters
    ----------
    X_train, y_train
        Training features and their labels, passed straight to ``SVC.fit``.
    X_test
        Features to predict on, passed straight to ``SVC.predict``.
    C, gamma, kernel
        SVC hyperparameters. The defaults reproduce the values that used to be
        hard-coded here, so existing three-argument calls behave as before.

    Returns
    -------
    The predictions returned by ``SVC.predict`` for ``X_test``.
    """
    clf = SVC(C=C, gamma=gamma, kernel=kernel)
    # train the model on the training data
    clf.fit(X_train, y_train)
    # predict the target values for the test data
    return clf.predict(X_test)


# Predictions for the held-out split, using the default hyperparameters.
y_pred = svm_model(X_train, y_train, X_test)

In [None]:
# Compare the held-out labels with the SVM predictions using the project's accuracy helper.
# As the last expression of the cell, whatever the helper returns is what the notebook shows.
accuracy_calculator(y_test, y_pred)

In [None]:
# The softmax layer needs one unit per distinct class. `len(y)` is the number of samples,
# which creates a far larger output layer than needed and lets argmax return an index
# the LabelEncoder cannot decode.
n_classes = len(encoder.classes_)

# Fully connected classifier: four ReLU hidden layers of decreasing width, softmax output.
model = Sequential()
model.add(Dense(512, input_shape=(X.shape[1],), activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(n_classes, activation='softmax'))

In [None]:
# Integer class labels are used directly, hence the sparse categorical cross-entropy loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train on the whole scaled development matrix (not just the training split);
# the returned History object is kept in `history`.
history = model.fit(X, y, batch_size=500, epochs=150)

In [None]:
# Network outputs for every row of the scaled evaluation matrix.
predictions = model.predict(x=X_eval)

In [None]:
# Index of the highest-scoring output unit along the last axis = predicted class code.
y_pred_classes = np.argmax(predictions, axis=-1)

# Map the integer codes back to the original target strings and
# hold them in a pandas Series named 'Predicted'.
y_pred_classes_decoded = pd.Series(
    encoder.inverse_transform(y_pred_classes),
    name='Predicted',
)


In [None]:
# One-column frame of decoded predictions; the row index is written out under the name 'Id'.
y_evaluation_df = pd.DataFrame(y_pred_classes_decoded, columns = ['Predicted'])
y_evaluation_df.index.name = 'Id'

# to_csv does not create missing directories, so make sure the output folder exists.
os.makedirs('evaluation', exist_ok=True)

# Tag the file with the local wall-clock time, using the `time` module that is already
# imported at the top of the notebook instead of a mid-notebook import.
# NOTE(review): ':' is not a legal filename character on Windows — confirm target platforms.
now = int(time.time())
readable_time = time.strftime('%H:%M:%S', time.localtime(now))
y_evaluation_df.to_csv(f'evaluation/copy_predictions-{readable_time}.csv')

In [None]:
# Persist the extracted feature frames when no cache file exists yet.
# The files must be opened for binary *writing*: this branch only runs when df.pkl is
# missing, so opening it with 'rb' raised FileNotFoundError and nothing was ever saved.
if not os.path.isfile("df.pkl"):
    with open('df.pkl', 'wb') as f:
        pickle.dump(df, f)
    with open('df_eval.pkl', 'wb') as f:
        pickle.dump(df_eval, f)