In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, f1_score

In [2]:
#import data
train_dataset = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_dataset = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

In [3]:
train_dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
test_dataset.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
train_dataset.isnull().sum().sort_values(ascending=False)

location    2533
keyword       61
id             0
text           0
target         0
dtype: int64

In [6]:
test_dataset.isnull().sum().sort_values(ascending=False)

location    1105
keyword       26
id             0
text           0
dtype: int64

In [7]:
train_dataset.dtypes

id           int64
keyword     object
location    object
text        object
target       int64
dtype: object

In [8]:
# Create a binary feature in the train dataset 'has_location' based on whether 'location' is not NaN
train_dataset['has_location'] = pd.notna(train_dataset['location']).astype(int)

# Create a binary feature in the test dataset 'has_location' based on whether 'location' is not NaN
test_dataset['has_location'] = pd.notna(test_dataset['location']).astype(int)

In [9]:
# drop the column 'location' in the train dataset
train_dataset.drop(['location'], axis=1, inplace=True)

# drop the column 'location' in the test dataset
test_dataset.drop(['location'], axis=1, inplace=True)

In [10]:
# Replace NaN values in the 'keyword' column with a placeholder value, e.g., 'unknown'
train_dataset['keyword'].fillna('unknown', inplace=True)
label_encoder = LabelEncoder() # Initialize the LabelEncoder
train_dataset['keyword_encoded'] = label_encoder.fit_transform(train_dataset['keyword']) # Fit and transform the 'keyword' column
train_dataset.drop('keyword', axis=1, inplace=True) # Drop the original 'keyword' column

# Replace NaN values in the 'keyword' column with a placeholder value, e.g., 'unknown'
test_dataset['keyword'].fillna('unknown', inplace=True)
test_dataset['keyword_encoded'] = label_encoder.fit_transform(test_dataset['keyword'])
test_dataset.drop('keyword', axis=1, inplace=True)

In [11]:
train_dataset.head()

Unnamed: 0,id,text,target,has_location,keyword_encoded
0,1,Our Deeds are the Reason of this #earthquake M...,1,0,206
1,4,Forest fire near La Ronge Sask. Canada,1,0,206
2,5,All residents asked to 'shelter in place' are ...,1,0,206
3,6,"13,000 people receive #wildfires evacuation or...",1,0,206
4,7,Just got sent this photo from Ruby #Alaska as ...,1,0,206


In [12]:
test_dataset.head()

Unnamed: 0,id,text,has_location,keyword_encoded
0,0,Just happened a terrible car crash,0,206
1,2,"Heard about #earthquake is different cities, s...",0,206
2,3,"there is a forest fire at spot pond, geese are...",0,206
3,9,Apocalypse lighting. #Spokane #wildfires,0,206
4,11,Typhoon Soudelor kills 28 in China and Taiwan,0,206


In [13]:
# Download NLTK resources
nltk.download('stopwords')

# Initialize the Porter Stemmer and stopwords
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Function for text preprocessing
def preprocess_text(text):
    
    # Remove URLs, special characters, and numbers
    text = re.sub(r'http\S+|www\S+|https\S+|\d+|#[\w]+', '', text).lower()

    # Remove all punctuation marks
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenization and stemming
    words = [ps.stem(word) for word in text.split() if word not in stop_words]

    # Join the words back into a string
    processed_text = ' '.join(words)

    return processed_text

# Apply the preprocessing function to your 'text' column
train_dataset['processed_text'] = train_dataset['text'].apply(preprocess_text)

test_dataset['processed_text'] = test_dataset['text'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# Extract features (X) and target variable (y)
X_train = train_dataset.drop('target', axis=1)  # Features
y_train = train_dataset['target']  # Target variable

X_test = test_dataset

In [15]:
# transform the text data to feature vectors that can be used as input
# TF-IDF vectorization
tfidf_vectorizer = CountVectorizer(min_df=1, stop_words='english', lowercase=True)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['processed_text'])
X_test_tfidf = tfidf_vectorizer.transform(X_test['processed_text'])

In [16]:
print("Number of unique words:", len(tfidf_vectorizer.get_feature_names_out()))

Number of unique words: 12812


In [17]:
X_train.shape

(7613, 5)

In [18]:
X_train_tfidf.shape

(7613, 12812)

In [19]:
X_test.shape

(3263, 5)

In [20]:
X_test_tfidf.shape

(3263, 12812)

In [21]:
# Instantiate Support Vector Machine model
svm_model = SVC()

# Define the hyperparameter grid
param_grid = {
    'C': [0.1, 1, 10],  # You can add more values for C
    'kernel': ['linear', 'rbf'],  # Different kernel options
    'gamma': ['scale', 'auto']  # You can add more values for gamma
}

# Use F1 score as the scoring metric
f1_scorer = make_scorer(f1_score)

# Instantiate GridSearchCV with cross-validation (Stratified K-Fold)
grid_search_svm = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    cv=StratifiedKFold(n_splits=7, shuffle=True),
    scoring=f1_scorer,
    verbose=1,
    n_jobs=-1  # Use all available CPU cores
)

## Fit the SVM model
grid_search_svm.fit(X_train_tfidf, y_train)

# Get the best hyperparameters
best_params_svm = grid_search_svm.best_params_

# Print the best hyperparameters for SVM
print("Best Hyperparameters for SVM:")
print(best_params_svm)

Fitting 7 folds for each of 12 candidates, totalling 84 fits
Best Hyperparameters for SVM:
{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}


In [22]:
model = SVC(C=1, gamma='scale', kernel='rbf')
model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

In [23]:
print(y_pred)

[1 0 1 ... 1 1 0]


In [24]:
# Create a submission DataFrame
submission = pd.DataFrame({
    'id': X_test['id'].astype(int),
    'target': y_pred.astype(int)
})

In [25]:
# If they don't match, reset the index in submission_df
submission.reset_index(drop=True, inplace=True)

# Now, try saving the CSV again
submission.to_csv('submission.csv', index=False)