In [2]:
import pandas as pd
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')

In [3]:
train['Discussion'].isnull().sum()

343

In [4]:
train = train.dropna(subset=['Discussion'])

In [5]:
train['Discussion'].isnull().sum()          # should be 0

0

In [6]:
file = open('/content/ClassesMap.txt','r')
file2 = file.read()
file.close()
file2

'Politics --> 0\nSports --> 1\nMedia --> 2\nMarket & Economy --> 3\nSTEM --> 4'

In [7]:
train['Category'].replace({
    'Politics': 0,
    'Sports': 1,
    'Media': 2,
    'Market & Economy': 3,
    'STEM': 4
}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Category'].replace({
  train['Category'].replace({


In [8]:
train['Category'].head(7)

Unnamed: 0,Category
0,1
1,4
2,4
3,1
4,0
5,2
6,2


In [9]:
! pip install wordninja

Collecting wordninja
  Downloading wordninja-2.0.0.tar.gz (541 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/541.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m532.5/541.6 kB[0m [31m18.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m541.6/541.6 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wordninja
  Building wheel for wordninja (setup.py) ... [?25l[?25hdone
  Created wheel for wordninja: filename=wordninja-2.0.0-py3-none-any.whl size=541530 sha256=bc2a7559dbe7e193abfe579f4a7cf520e17344e401a5c2784bf2faf5394474fa
  Stored in directory: /root/.cache/pip/wheels/aa/44/3a/f2a5c1859b8b541ded969b4cd12d0a58897f12408f4f51e084
Successfully built wordninja
Installing collected packages: wordninja
Successfully installed wordninja-2.0.0


In [10]:
! pip install tensorflow



In [11]:
import re
import wordninja

def preprocess_text(text):
    # 1 lowercase
    text = text.lower()
    # 2 \n
    text = text.replace('\n', ' ')
    # 3 special chars and punctuations
    text = re.sub(r'[^\w\s\d]', '', text)
    # 4 WordNinja to any word of length > 10
    text = ' '.join([
        ' '.join(wordninja.split(word)) if len(word) > 10 else word
        for word in text.split()
    ])
    return text

In [12]:
train['Discussion'] = train['Discussion'].apply(preprocess_text)
test['Discussion'] = test['Discussion'].apply(preprocess_text)

In [13]:
train['Discussion'].iloc[0]

'without sitting down and doing it manually you might try some scheduling software there are several here is one that you can download i havent tried it but it seems to do the job nn http www download com the league system pro 30007427 410505040 html tag pdp prod'

In [14]:
test['Discussion'].iloc[281]

'http www x rates com d usd mxn graph 120 html'

In [15]:
remove_words = ["http", "www", "com"]

def remove_specific_words(text):
    filtered_text = ' '.join([word for word in text.split() if word not in remove_words])
    return filtered_text


train['Discussion'] = train['Discussion'].apply(remove_specific_words)
test['Discussion'] = test['Discussion'].apply(remove_specific_words)


In [16]:
print(train['Discussion'].iloc[0])
print("......................................................................................")
print(test['Discussion'].iloc[281])

without sitting down and doing it manually you might try some scheduling software there are several here is one that you can download i havent tried it but it seems to do the job nn download the league system pro 30007427 410505040 html tag pdp prod
......................................................................................
x rates d usd mxn graph 120 html


In [17]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D, Bidirectional
from tensorflow.keras.utils import to_categorical

# concatinating the 2 columns for word embedding (this stratigy achived the highest acc)
combined = pd.concat([train['Discussion'], test['Discussion']], axis=0)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(combined)

X_train_seq = tokenizer.texts_to_sequences(train['Discussion'])
X_test_seq = tokenizer.texts_to_sequences(test['Discussion'])


max_seq_length = 250
X_train_padded = pad_sequences(X_train_seq, maxlen=max_seq_length, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_seq_length, padding='post')

# encoding
y_train = to_categorical(train['Category'].values)

# train_test split
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train_padded, y_train, test_size=0.2, random_state=42, stratify=train['Category']
)

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Define CNN Model
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 128

model = Sequential([
    # Embedding Layer
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_seq_length),

    # Convolutional Layer
    Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'),
    MaxPooling1D(pool_size=2),

    # Add another Convolutional Layer for deeper features (optional)
    Conv1D(filters=64, kernel_size=3, activation='relu', padding='same'),
    MaxPooling1D(pool_size=2),

    # Global Pooling Layer
    GlobalMaxPooling1D(),

    # Fully Connected Dense Layer
    Dense(64, activation='relu'),
    Dropout(0.4),

    # Output Layer
    Dense(y_train_final.shape[1], activation='softmax')  # Assuming one-hot encoded labels
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Add Early Stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
batch_size = 64
epochs = 20

history = model.fit(
    X_train_final, y_train_final,
    validation_data=(X_val, y_val),
    batch_size=batch_size,
    epochs=epochs,
    # callbacks=[early_stopping],
    verbose=1
)


Epoch 1/20




[1m309/309[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 418ms/step - accuracy: 0.3113 - loss: 1.4927 - val_accuracy: 0.6359 - val_loss: 0.9600
Epoch 2/20
[1m309/309[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 430ms/step - accuracy: 0.7281 - loss: 0.7788 - val_accuracy: 0.6748 - val_loss: 0.8763
Epoch 3/20
[1m309/309[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 424ms/step - accuracy: 0.8451 - loss: 0.4673 - val_accuracy: 0.6613 - val_loss: 0.9650
Epoch 4/20
[1m309/309[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 423ms/step - accuracy: 0.9163 - loss: 0.2704 - val_accuracy: 0.6554 - val_loss: 1.2465
Epoch 5/20
[1m309/309[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 425ms/step - accuracy: 0.9467 - loss: 0.1699 - val_accuracy: 0.6452 - val_loss: 1.3813
Epoch 6/20
[1m309/309[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 498ms/step - accuracy: 0.9638 - loss: 0.1137 - val_accuracy: 0.6491 - val_loss: 1.5561
Epoch 7/20
[1m

In [19]:
test_predictions = model.predict(X_test_padded)
test['Category'] = np.argmax(test_predictions, axis=1)


submission_df = test[['SampleID', 'Category']]
submission_df.to_csv('LSTM_submission.csv', index=False)

[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 41ms/step


In [23]:
from google.colab import files

# Save the model first
model.save('CNN.h5')
# Download the saved model file
files.download('CNN.h5')




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>