In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the text-sequence train and validation splits.
# `input_str` is pinned to str: the per-character split done later in this notebook
# needs real strings, and the leading zeros of each sequence must survive parsing
# instead of depending on the CSV reader's type inference.
train_seq_df = pd.read_csv("datasets/train/train_text_seq.csv", dtype={"input_str": str})
train_seq_Y = train_seq_df['label']

val_seq_df = pd.read_csv("datasets/valid/valid_text_seq.csv", dtype={"input_str": str})
val_seq_Y = val_seq_df['label']

In [3]:
# Preview the first rows of the raw training frame (input_str and label columns).
train_seq_df.head()

Unnamed: 0,input_str,label
0,0000154364642718159661428002624223132284159626...,0
1,0004641596369515436422262614110471596262476161...,0
2,0001543626215965999614422464135806142624051159...,0
3,0000154364224641238614262159689561596284351061...,1
4,0004641899422154362069015966142624761262159661...,1


In [12]:
# One-hot encode the training sequences: 50 character positions x 10 digits = 500 features.

# The ten digit labels, in the order the one-hot columns must appear for each position.
digit_labels = [str(d) for d in range(10)]

# Fail loudly on any character that is not a decimal digit. Without this check such
# characters would become missing values below and be encoded as all-zero rows.
unexpected_chars = set(''.join(train_seq_df['input_str'])) - set(digit_labels)
if unexpected_chars:
    raise ValueError(f"input_str contains non-digit characters: {sorted(unexpected_chars)}")

# Split every sequence into one column per character position (col_1 ... col_50).
# A list of lists is used so that no per-row pd.Series has to be created.
split_columns = pd.DataFrame(
    [list(seq) for seq in train_seq_df['input_str']],
    index=train_seq_df.index,
)
split_columns.columns = [f'col_{i+1}' for i in range(50)]

# A categorical dtype with the fixed categories '0'..'9' makes get_dummies emit all ten
# columns per position (col_i_0 ... col_i_9), already in position/digit order and with
# one dtype for every column. Digits that never occur at a position simply yield an
# all-zero column, so no manual back-filling, concatenation loop or re-sorting is needed.
digit_dtype = pd.CategoricalDtype(categories=digit_labels)
train_X = pd.get_dummies(split_columns.astype(digit_dtype))

print(train_X.shape)


(7080, 500)


In [14]:
# Preview the first rows of the one-hot encoded training features.
train_X.head()

Unnamed: 0,col_1_0,col_1_1,col_1_2,col_1_3,col_1_4,col_1_5,col_1_6,col_1_7,col_1_8,col_1_9,...,col_50_0,col_50_1,col_50_2,col_50_3,col_50_4,col_50_5,col_50_6,col_50_7,col_50_8,col_50_9
0,True,0,0,0,0,0,0,0,0,0,...,False,False,False,False,True,False,False,False,False,False
1,True,0,0,0,0,0,0,0,0,0,...,False,False,False,False,True,False,False,False,False,False
2,True,0,0,0,0,0,0,0,0,0,...,False,False,False,False,True,False,False,False,False,False
3,True,0,0,0,0,0,0,0,0,0,...,False,False,True,False,False,False,False,False,False,False
4,True,0,0,0,0,0,0,0,0,0,...,False,False,False,False,True,False,False,False,False,False


In [15]:
# One-hot encode the validation sequences: 50 character positions x 10 digits = 500 features.

# The ten digit labels, in the order the one-hot columns must appear for each position.
digit_labels = [str(d) for d in range(10)]

# Fail loudly on any character that is not a decimal digit. Without this check such
# characters would become missing values below and be encoded as all-zero rows.
unexpected_chars = set(''.join(val_seq_df['input_str'])) - set(digit_labels)
if unexpected_chars:
    raise ValueError(f"input_str contains non-digit characters: {sorted(unexpected_chars)}")

# Split every sequence into one column per character position (col_1 ... col_50).
# A list of lists is used so that no per-row pd.Series has to be created.
split_columns = pd.DataFrame(
    [list(seq) for seq in val_seq_df['input_str']],
    index=val_seq_df.index,
)
split_columns.columns = [f'col_{i+1}' for i in range(50)]

# A categorical dtype with the fixed categories '0'..'9' makes get_dummies emit all ten
# columns per position (col_i_0 ... col_i_9), already in position/digit order and with
# one dtype for every column. Digits that never occur at a position simply yield an
# all-zero column, so no manual back-filling, concatenation loop or re-sorting is needed.
digit_dtype = pd.CategoricalDtype(categories=digit_labels)
val_X = pd.get_dummies(split_columns.astype(digit_dtype))

print(val_X.shape)


(489, 500)


In [16]:
# Build five training sets holding 20%, 40%, 60%, 80% and 100% of the training data.
from sklearn.model_selection import train_test_split


def _split_off(holdout_fraction):
    """Split (train_X, train_seq_Y) with seed 42, passing `holdout_fraction` as test_size."""
    return train_test_split(
        train_X,
        train_seq_Y,
        test_size=holdout_fraction,
        random_state=42,
    )


# test_size is the share that is set aside, so the kept training share is its complement.
X_train_20, X_test_20, y_train_20, y_test_20 = _split_off(0.8)
X_train_40, X_test_40, y_train_40, y_test_40 = _split_off(0.6)
X_train_60, X_test_60, y_train_60, y_test_60 = _split_off(0.4)
X_train_80, X_test_80, y_train_80, y_test_80 = _split_off(0.2)

# The 100% variant is the untouched training frame and its labels.
X_train_100, y_train_100 = train_X, train_seq_Y

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [18]:
# Random forest fitted on the 20% subset and scored on the validation set.
rf_20 = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_20, y_train_20)
y_pred_20 = rf_20.predict(val_X)
accuracy_20 = accuracy_score(y_true=val_seq_Y, y_pred=y_pred_20)
print("Accuracy of 20% of the data: ", accuracy_20)


Accuracy of 20% of the data:  0.5521472392638037


In [19]:
# Random forest fitted on the 40% subset and scored on the validation set.
rf_40 = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
rf_40.fit(X_train_40, y_train_40)
y_pred_40 = rf_40.predict(val_X)
accuracy_40 = accuracy_score(y_true=val_seq_Y, y_pred=y_pred_40)
print("Accuracy of 40% of the data: ", accuracy_40)


Accuracy of 40% of the data:  0.5807770961145194


In [20]:
# Random forest fitted on the 60% subset and scored on the validation set.
rf_60 = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_60, y_train_60)
y_pred_60 = rf_60.predict(val_X)
accuracy_60 = accuracy_score(y_true=val_seq_Y, y_pred=y_pred_60)
print("Accuracy of 60% of the data: ", accuracy_60)


Accuracy of 60% of the data:  0.6053169734151329


In [21]:
# Random forest fitted on the 80% subset and scored on the validation set.
rf_80 = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
rf_80.fit(X_train_80, y_train_80)
y_pred_80 = rf_80.predict(val_X)
accuracy_80 = accuracy_score(y_true=val_seq_Y, y_pred=y_pred_80)
print("Accuracy of 80% of the data: ", accuracy_80)


Accuracy of 80% of the data:  0.623721881390593


In [22]:
# Random forest fitted on the full training set and scored on the validation set.
rf_100 = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_100, y_train_100)
y_pred_100 = rf_100.predict(val_X)
accuracy_100 = accuracy_score(y_true=val_seq_Y, y_pred=y_pred_100)
print("Accuracy of 100% of the data: ", accuracy_100)


Accuracy of 100% of the data:  0.656441717791411


In [30]:
# train a neural network model  
from sklearn.neural_network import MLPClassifier

In [29]:
# MLP with two hidden layers of 100 units, fitted on the 20% subset.
# NOTE: y_pred_20 / accuracy_20 are the same names the random-forest cell assigns,
# so running this cell replaces those earlier values.
mlp_20 = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    max_iter=1000,
    random_state=42,
)
mlp_20.fit(X_train_20, y_train_20)
y_pred_20 = mlp_20.predict(val_X)
accuracy_20 = accuracy_score(y_true=val_seq_Y, y_pred=y_pred_20)
print("Accuracy of 20% of the data: ", accuracy_20)


Accuracy of 20% of the data:  0.5971370143149284


In [31]:
# MLP with two hidden layers of 100 units, fitted on the 40% subset.
# NOTE: y_pred_40 / accuracy_40 are the same names the random-forest cell assigns,
# so running this cell replaces those earlier values.
mlp_40 = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    max_iter=1000,
    random_state=42,
).fit(X_train_40, y_train_40)
y_pred_40 = mlp_40.predict(val_X)
accuracy_40 = accuracy_score(y_true=val_seq_Y, y_pred=y_pred_40)
print("Accuracy of 40% of the data: ", accuracy_40)


Accuracy of 40% of the data:  0.6114519427402862


In [32]:
# MLP with two hidden layers of 100 units, fitted on the 60% subset.
# NOTE: y_pred_60 / accuracy_60 are the same names the random-forest cell assigns,
# so running this cell replaces those earlier values.
mlp_60 = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    max_iter=1000,
    random_state=42,
)
mlp_60.fit(X_train_60, y_train_60)
y_pred_60 = mlp_60.predict(val_X)
accuracy_60 = accuracy_score(y_true=val_seq_Y, y_pred=y_pred_60)
print("Accuracy of 60% of the data: ", accuracy_60)

Accuracy of 60% of the data:  0.6339468302658486


In [33]:
# MLP with two hidden layers of 100 units, fitted on the 80% subset.
# NOTE: y_pred_80 / accuracy_80 are the same names the random-forest cell assigns,
# so running this cell replaces those earlier values.
mlp_80 = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    max_iter=1000,
    random_state=42,
).fit(X_train_80, y_train_80)
y_pred_80 = mlp_80.predict(val_X)
accuracy_80 = accuracy_score(y_true=val_seq_Y, y_pred=y_pred_80)
print("Accuracy of 80% of the data: ", accuracy_80)


Accuracy of 80% of the data:  0.6441717791411042


In [34]:
# MLP with two hidden layers of 100 units, fitted on the full training set.
# NOTE: y_pred_100 / accuracy_100 are the same names the random-forest cell assigns,
# so running this cell replaces those earlier values.
mlp_100 = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    max_iter=1000,
    random_state=42,
)
mlp_100.fit(X_train_100, y_train_100)
y_pred_100 = mlp_100.predict(val_X)
accuracy_100 = accuracy_score(y_true=val_seq_Y, y_pred=y_pred_100)
print("Accuracy of 100% of the data: ", accuracy_100)

Accuracy of 100% of the data:  0.6523517382413088
