In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the text-sequence datasets (training and validation splits).
# `input_str` is a string of digits with leading zeros that is later split
# character by character, so force it to be parsed as text rather than
# relying on pandas' type inference.
train_seq_df = pd.read_csv("datasets/train/train_text_seq.csv", dtype={"input_str": str})
train_seq_Y = train_seq_df['label']

val_seq_df = pd.read_csv("datasets/valid/valid_text_seq.csv", dtype={"input_str": str})
val_seq_Y = val_seq_df['label']

In [3]:
# Peek at the first five rows of the raw training frame.
train_seq_df.head(n=5)

Unnamed: 0,input_str,label
0,0000154364642718159661428002624223132284159626...,0
1,0004641596369515436422262614110471596262476161...,0
2,0001543626215965999614422464135806142624051159...,0
3,0000154364224641238614262159689561596284351061...,1
4,0004641899422154362069015966142624761262159661...,1


In [4]:
# Expand every digit string into one integer column per character.
# The frame is built once from a list of per-row character lists; creating a
# pd.Series for every row via .apply is far slower for the same result.
train_X = pd.DataFrame(
    train_seq_df['input_str'].map(list).tolist(),
    index=train_seq_df.index,
)
# Name the columns col_1..col_N, with N taken from the data rather than hard-coded.
train_X.columns = [f'col_{i+1}' for i in range(train_X.shape[1])]
# Rows of unequal length leave missing cells, which makes this cast raise.
train_X = train_X.astype(int)

In [5]:
# Inspect the column dtypes and non-null counts of the expanded feature frame.
train_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7080 entries, 0 to 7079
Data columns (total 50 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   col_1   7080 non-null   int32
 1   col_2   7080 non-null   int32
 2   col_3   7080 non-null   int32
 3   col_4   7080 non-null   int32
 4   col_5   7080 non-null   int32
 5   col_6   7080 non-null   int32
 6   col_7   7080 non-null   int32
 7   col_8   7080 non-null   int32
 8   col_9   7080 non-null   int32
 9   col_10  7080 non-null   int32
 10  col_11  7080 non-null   int32
 11  col_12  7080 non-null   int32
 12  col_13  7080 non-null   int32
 13  col_14  7080 non-null   int32
 14  col_15  7080 non-null   int32
 15  col_16  7080 non-null   int32
 16  col_17  7080 non-null   int32
 17  col_18  7080 non-null   int32
 18  col_19  7080 non-null   int32
 19  col_20  7080 non-null   int32
 20  col_21  7080 non-null   int32
 21  col_22  7080 non-null   int32
 22  col_23  7080 non-null   int32
 23  col_24  7080 

In [6]:
# Expand every validation digit string into one integer column per character.
# The frame is built once from a list of per-row character lists; creating a
# pd.Series for every row via .apply is far slower for the same result.
val_X = pd.DataFrame(
    val_seq_df['input_str'].map(list).tolist(),
    index=val_seq_df.index,
)
# Name the columns col_1..col_N, with N taken from the data rather than hard-coded.
val_X.columns = [f'col_{i+1}' for i in range(val_X.shape[1])]
# Rows of unequal length leave missing cells, which makes this cast raise.
val_X = val_X.astype(int)

In [7]:
# Build five training sets holding 20%, 40%, 60%, 80% and 100% of the
# training data. Each partial set is what remains after holding out the
# complementary fraction as a "test" part, always with the same random_state.
from sklearn.model_selection import train_test_split

X_train_20, X_test_20, y_train_20, y_test_20 = train_test_split(
    train_X, train_seq_Y, random_state=42, test_size=0.8
)
X_train_40, X_test_40, y_train_40, y_test_40 = train_test_split(
    train_X, train_seq_Y, random_state=42, test_size=0.6
)
X_train_60, X_test_60, y_train_60, y_test_60 = train_test_split(
    train_X, train_seq_Y, random_state=42, test_size=0.4
)
X_train_80, X_test_80, y_train_80, y_test_80 = train_test_split(
    train_X, train_seq_Y, random_state=42, test_size=0.2
)

# The 100% set is simply the full training data.
X_train_100, y_train_100 = train_X, train_seq_Y

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [51]:
# Random forest trained on the 20% subset, scored on the validation set.
rf_20 = RandomForestClassifier(random_state=42, n_estimators=100)
rf_20.fit(X=X_train_20, y=y_train_20)
y_pred_20 = rf_20.predict(X=val_X)
accuracy_20 = accuracy_score(y_true=val_seq_Y, y_pred=y_pred_20)
print("Accuracy of 20% of the data: ", accuracy_20)


Accuracy of 20% of the data:  0.5521472392638037


In [52]:
# Random forest trained on the 40% subset, scored on the validation set.
rf_40 = RandomForestClassifier(random_state=42, n_estimators=100)
rf_40.fit(X=X_train_40, y=y_train_40)
y_pred_40 = rf_40.predict(X=val_X)
accuracy_40 = accuracy_score(y_true=val_seq_Y, y_pred=y_pred_40)
print("Accuracy of 40% of the data: ", accuracy_40)


Accuracy of 40% of the data:  0.6053169734151329


In [53]:
# Random forest trained on the 60% subset, scored on the validation set.
rf_60 = RandomForestClassifier(random_state=42, n_estimators=100)
rf_60.fit(X=X_train_60, y=y_train_60)
y_pred_60 = rf_60.predict(X=val_X)
accuracy_60 = accuracy_score(y_true=val_seq_Y, y_pred=y_pred_60)
print("Accuracy of 60% of the data: ", accuracy_60)


Accuracy of 60% of the data:  0.6053169734151329


In [54]:
# Random forest trained on the 80% subset, scored on the validation set.
rf_80 = RandomForestClassifier(random_state=42, n_estimators=100)
rf_80.fit(X=X_train_80, y=y_train_80)
y_pred_80 = rf_80.predict(X=val_X)
accuracy_80 = accuracy_score(y_true=val_seq_Y, y_pred=y_pred_80)
print("Accuracy of 80% of the data: ", accuracy_80)


Accuracy of 80% of the data:  0.5950920245398773


In [55]:
# Random forest trained on the full training set, scored on the validation set.
rf_100 = RandomForestClassifier(random_state=42, n_estimators=100)
rf_100.fit(X=X_train_100, y=y_train_100)
y_pred_100 = rf_100.predict(X=val_X)
accuracy_100 = accuracy_score(y_true=val_seq_Y, y_pred=y_pred_100)
print("Accuracy of 100% of the data: ", accuracy_100)


Accuracy of 100% of the data:  0.6339468302658486


In [9]:
# train a deep learning model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers import Conv1D
from keras.layers import MaxPooling1D
from keras.layers import Embedding
from keras.preprocessing import sequence

In [14]:
# Create the model: embedding -> 1-D convolution -> max pooling -> dense
# head with a single sigmoid output for binary classification.
model = Sequential()
# NOTE(review): the inputs look like single digits (0-9), so a vocabulary of
# 256 is probably larger than needed — confirm before shrinking it.
model.add(Embedding(256, 16, input_length=50))
model.add(Conv1D(filters=8, kernel_size=4, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 50, 16)            4096      
                                                                 
 conv1d_2 (Conv1D)           (None, 50, 8)             520       
                                                                 
 max_pooling1d_2 (MaxPoolin  (None, 25, 8)             0         
 g1D)                                                            
                                                                 
 flatten_2 (Flatten)         (None, 200)               0         
                                                                 
 dense_4 (Dense)             (None, 32)                6432      
                                                                 
 dense_5 (Dense)             (None, 1)                 33        
                                                      

In [15]:
# Train on the full training set for 50 epochs, reporting validation metrics
# after every epoch.
model.fit(
    x=X_train_100,
    y=y_train_100,
    batch_size=64,
    epochs=50,
    validation_data=(val_X, val_seq_Y),
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x203a17d33d0>