In [None]:
import pandas as pd
# Load the training and test splits from the Kaggle input directory.
train, test = (
    pd.read_csv('/kaggle/input/data-1/train.csv'),
    pd.read_csv('/kaggle/input/data-1/test.csv'),
)

In [None]:
# Number of training rows whose 'Discussion' text is missing.
train['Discussion'].isna().sum()

343

In [None]:
# Keep only the training rows that actually have a 'Discussion' value.
train = train.dropna(axis=0, how='any', subset=['Discussion'])

In [None]:
# Sanity check after the drop: no missing 'Discussion' values may remain (expect 0).
train['Discussion'].isna().sum()

0

In [None]:
# Read the label-name -> label-id mapping shipped with the dataset.
# The context manager closes the handle even if read() raises.
with open('/kaggle/input/data-1/ClassesMap.txt', 'r') as file:
    file2 = file.read()
file2

'Politics --> 0\nSports --> 1\nMedia --> 2\nMarket & Economy --> 3\nSTEM --> 4'

In [None]:
# Encode the category names as the integer ids listed in ClassesMap.txt.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on `train['Category']`: that form is chained
# assignment, which pandas warns about and which will no longer modify
# `train` from pandas 3.0 on.
train['Category'] = train['Category'].replace({
    'Politics': 0,
    'Sports': 1,
    'Media': 2,
    'Market & Economy': 3,
    'STEM': 4
})

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Category'].replace({
  train['Category'].replace({


In [None]:
# Peek at the first seven encoded labels.
train['Category'].iloc[:7]

0    1
1    4
2    4
3    1
4    0
5    2
6    2
Name: Category, dtype: int64

In [None]:
! pip install wordninja

Collecting wordninja
  Downloading wordninja-2.0.0.tar.gz (541 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m541.6/541.6 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: wordninja
  Building wheel for wordninja (setup.py) ... [?25ldone
[?25h  Created wheel for wordninja: filename=wordninja-2.0.0-py3-none-any.whl size=541530 sha256=5fe9833139a9d46aa99e1f34be6796dac5ad355dbf6807fc1c0a1a8e1c6dde09
  Stored in directory: /root/.cache/pip/wheels/aa/44/3a/f2a5c1859b8b541ded969b4cd12d0a58897f12408f4f51e084
Successfully built wordninja
Installing collected packages: wordninja
Successfully installed wordninja-2.0.0


In [None]:
! pip install tensorflow



In [None]:
import re
import wordninja

def preprocess_text(text):
    """Normalise one raw discussion string before tokenisation.

    Steps, in order:
      1. lowercase the text;
      2. turn newlines into spaces;
      3. delete every character that is not a word character, whitespace
         or a digit (this removes punctuation; underscores count as word
         characters and are kept);
      4. run ``wordninja.split`` on every whitespace-separated token longer
         than 10 characters and join all tokens with single spaces.

    Args:
        text: The raw text. Non-string values (e.g. the NaN float pandas
            uses for a missing cell, or None) are accepted.

    Returns:
        The cleaned text, or ``''`` when ``text`` is not a string.
    """
    # Missing cells reach ``Series.apply`` as NaN/None; without this guard
    # ``.lower()`` raises AttributeError on the first such row.
    if not isinstance(text, str):
        return ''

    # 1 lowercase
    text = text.lower()
    # 2 newlines -> spaces
    text = text.replace('\n', ' ')
    # 3 special chars and punctuation
    text = re.sub(r'[^\w\s\d]', '', text)
    # 4 WordNinja on any token of length > 10 (glued-together words / URLs)
    text = ' '.join([
        ' '.join(wordninja.split(word)) if len(word) > 10 else word
        for word in text.split()
    ])
    return text

In [None]:
# Clean the text column of both splits with the same routine.
train['Discussion'] = train['Discussion'].map(preprocess_text)
test['Discussion'] = test['Discussion'].map(preprocess_text)

In [None]:
# Spot-check the first cleaned training text.
train['Discussion'].iat[0]

'without sitting down and doing it manually you might try some scheduling software there are several here is one that you can download i havent tried it but it seems to do the job nn http www download com the league system pro 30007427 410505040 html tag pdp prod'

In [None]:
# Spot-check a cleaned test text (row position 281).
test['Discussion'].iat[281]

'http www x rates com d usd mxn graph 120 html'

In [None]:
# Tokens to strip from every text (left-over URL fragments).
remove_words = ["http", "www", "com"]


def remove_specific_words(text):
    """Return ``text`` without the whitespace-separated tokens in ``remove_words``.

    Only whole tokens are removed; the remaining tokens are re-joined with
    single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token in remove_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)


# Strip the URL-fragment tokens from both splits.
train['Discussion'] = train['Discussion'].map(remove_specific_words)
test['Discussion'] = test['Discussion'].map(remove_specific_words)


In [None]:
# Show one filtered text from each split, separated by a dotted rule.
print(
    train['Discussion'].iloc[0],
    "......................................................................................",
    test['Discussion'].iloc[281],
    sep='\n',
)

without sitting down and doing it manually you might try some scheduling software there are several here is one that you can download i havent tried it but it seems to do the job nn download the league system pro 30007427 410505040 html tag pdp prod
......................................................................................
x rates d usd mxn graph 120 html


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D, Bidirectional
from tensorflow.keras.utils import to_categorical

# Concatenate the text of both splits so the tokenizer vocabulary covers
# train and test (this strategy achieved the highest accuracy).
combined = pd.concat([train['Discussion'], test['Discussion']], axis=0)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(combined)

# Texts -> lists of integer word ids, one list per row.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(frame['Discussion']) for frame in (train, test)
)

# Bring every sequence to a fixed length; padding is appended at the end.
max_seq_length = 250
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=max_seq_length, padding='post')
    for sequences in (X_train_seq, X_test_seq)
)

# One-hot encode the integer class ids.
y_train = to_categorical(train['Category'].to_numpy())

# Hold out 20% of the training rows for validation, stratified by class.
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train_padded,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=train['Category'],
)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# LSTM classifier: embedding -> bidirectional LSTM -> LSTM -> dense softmax.
# +1 because Tokenizer word ids start at 1; id 0 is the value pad_sequences fills with.
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 128

model = Sequential([
    # mask_zero=True makes the recurrent layers skip the padded timesteps.
    # The inputs are post-padded to `max_seq_length`, so without a mask the
    # LSTMs process a long run of padding before emitting their final state;
    # the previous run stayed at chance level (loss ~1.60, accuracy ~0.22).
    # The deprecated `input_length` argument is dropped; the sequence length
    # is inferred from the data passed to fit().
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True),
    SpatialDropout1D(0.3),
    Bidirectional(LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.4),
    # One output unit per class (width of the one-hot label matrix).
    Dense(y_train.shape[1], activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# early stopping but not used this time
# early_stopping = EarlyStopping(
#     monitor='val_loss',
#     patience=5,
#     restore_best_weights=True
# )


batch_size = 64
epochs = 20

history = model.fit(
    X_train_final, y_train_final,
    validation_data=(X_val, y_val),
    batch_size=batch_size,
    epochs=epochs,
    #callbacks=[early_stopping],
    verbose=1
)



Epoch 1/20
[1m309/309[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m306s[0m 962ms/step - accuracy: 0.2213 - loss: 1.6047 - val_accuracy: 0.2280 - val_loss: 1.6022
Epoch 2/20
[1m309/309[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m296s[0m 958ms/step - accuracy: 0.2317 - loss: 1.6002 - val_accuracy: 0.2292 - val_loss: 1.5994
Epoch 3/20
[1m309/309[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m296s[0m 957ms/step - accuracy: 0.2277 - loss: 1.5980 - val_accuracy: 0.2294 - val_loss: 1.5971
Epoch 4/20
[1m309/309[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m296s[0m 959ms/step - accuracy: 0.2273 - loss: 1.5933 - val_accuracy: 0.2217 - val_loss: 1.5983
Epoch 5/20
[1m309/309[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m296s[0m 957ms/step - accuracy: 0.2346 - loss: 1.5866 - val_accuracy: 0.2221 - val_loss: 1.6058
Epoch 6/20
[1m309/309[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m295s[0m 954ms/step - accuracy: 0.2333 - loss: 1.5820 - val_accuracy: 0.2237 - val_loss: 1.6005
Epoc

In [None]:
# Predict class probabilities for every test row.
test_predictions = model.predict(X_test_padded)
# Store the most probable class id for each row.
test['Category'] = test_predictions.argmax(axis=1)

# Write the two submission columns to disk.
submission_df = test.loc[:, ['SampleID', 'Category']]
submission_df.to_csv('LSTM_submission.csv', index=False)

[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 194ms/step
