In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Walk the Kaggle input directory and print the full path of every file found
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

In [None]:
#Import the needed libraries
import matplotlib.pyplot as plt 
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
import emoji
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
# Fetch the NLTK resources used further down: the stop-word lists read through
# stopwords.words(...) and the WordNet corpus (presumably for WordNetLemmatizer).
nltk.download("stopwords")
nltk.download("wordnet")


In [None]:
#!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [None]:
#Load the training data
df_train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
#df_train.shape #7613 rows

#Preview the first rows; display() is required because only the last expression of a cell is rendered automatically
display(df_train.head())

#Number of rows without a keyword (the call parentheses matter: a bare .sum only shows the bound method)
df_train["keyword"].isnull().sum()

In [None]:
#Collect every row whose tweet text occurs more than once (all occurrences are kept)
is_repeated_text = df_train.duplicated(subset="text", keep=False)
df_duplicated = df_train.loc[is_repeated_text] #179 rows
df_duplicated.head()

In [None]:
#Find all the tweets whose identical text carries more than one distinct label
pd.set_option('display.max_colwidth', None) #to display the whole text
#filter() keeps every row of each text group for which the predicate is True.
#Its second positional parameter is `dropna`, so no other object may be passed in that slot.
df_errors = df_train.groupby('text').filter(lambda x: x['target'].nunique() > 1)
df_errors.head(20)

#df_errors.shape 55

In [None]:
#Keep only the first occurrence of each tweet text.
#Duplicates are matched on the text alone, so later copies are dropped even when their target differs.
df_train = df_train.drop_duplicates(subset=['text'], keep='first')
df_train.shape #7503 rows

In [None]:
#Delete every row whose text was labeled inconsistently, i.e. every text that appears in df_errors.
#A direct mask on the text avoids concatenating the frames and relying on whole-row comparison.
has_conflicting_label = df_train["text"].isin(df_errors["text"])
df_train = df_train[~has_conflicting_label]
df_train.shape

In [None]:
#Check that all duplicates are removed: count the texts that still occur more than once (expected 0)
remaining_duplicates = df_train.duplicated(subset="text").sum()
print(remaining_duplicates)

#Also confirm that no row is missing its target label
null = df_train["target"].isnull().sum()
print(null)

In [None]:
#Check if there are any NaN in the text column

isna = df_train['text'].isna()
#Print the number of missing texts; printing the boolean Series itself only shows a truncated list of flags
print(isna.sum())

In [None]:
#Text Preprocessing (adapted to clean Twitter text)
import string

# English stop words from NLTK, stored as a set for membership tests while filtering tokens
stop_words = {*stopwords.words('english')}
# Tokenizer instance whose tokenize() method is called inside preprocess_text
tk = TweetTokenizer()
# Lemmatizer instance (created here; preprocess_text does not call it)
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Clean a raw tweet and return it as one space-joined string of tokens.

    Steps, in order: drop URLs, @mentions and #hashtags (the whole token, not
    only the leading symbol), strip ASCII punctuation, strip emojis,
    lowercase, tokenize with the module-level ``tk`` tokenizer and discard
    tokens contained in the module-level ``stop_words`` set.

    Parameters
    ----------
    text : str or any
        The tweet to clean. ``None`` yields ``''``; any other non-string
        value is returned as ``str(text)`` without further cleaning.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Non-string input is not cleaned: None -> '', anything else -> str(value)
    if not isinstance(text, str):
        return str(text) if text is not None else ''
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove user mentions
    text = re.sub(r"@\S+", "", text)
    # Remove hashtags
    text = re.sub(r"#\S+", "", text)
    # Remove punctuation. The characters must be escaped before being placed in
    # a character class: unescaped, the "\]" inside string.punctuation is read
    # as an escaped "]" and backslashes are left in the text.
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    # Remove emojis (emojize would do the opposite: turn shortcodes into emojis)
    text = emoji.replace_emoji(text, replace='')
    # Lowercase the text
    text = text.lower()
    # Tokenize the text
    words = tk.tokenize(text)
    # Remove stop words
    words = [w for w in words if w not in stop_words]
    # Join the tokens back together
    return ' '.join(words)
    


In [None]:
#Clean every training tweet with preprocess_text and store the result back in the text column
df_train['text'] = df_train['text'].map(preprocess_text)

In [None]:
# Number of rows that have no target label after the cleaning steps
missing_targets = df_train['target'].isna().sum()
print(missing_targets)

As the bar plot of label counts further below shows, the two classes are fairly balanced.

In [None]:
from gensim.models import Word2Vec


from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Convert the 100-dimensional GloVe text file to word2vec text format, then load it as keyed vectors
glove_input_file = '/kaggle/input/glove6b100dtxt/glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.word2vec'
glove2word2vec(
    glove_input_file=glove_input_file,
    word2vec_output_file=word2vec_output_file,
)
model = KeyedVectors.load_word2vec_format(fname=word2vec_output_file, binary=False)

# Turn a text into a single vector by averaging the vectors of its words
def text_to_vector(text):
    """Return the mean embedding of the whitespace-separated words of ``text``.

    Words that are not contained in the module-level ``model`` are skipped.
    When no word is found (including for an empty text) a zero vector of
    length ``model.vector_size`` is returned instead.
    """
    embeddings = []
    for token in text.split():
        if token in model:
            embeddings.append(model[token])
    if not embeddings:
        return np.zeros(model.vector_size)
    return np.mean(embeddings, axis=0)

# Embed every training tweet and store the embedding as a plain Python list,
# so that each cell of the 'text_vector' column holds a list of floats
df_train['text_vector'] = df_train['text'].map(
    lambda tweet: np.asarray(text_to_vector(tweet.lower())).tolist()
)

In [None]:
# Preview the first rows, now including the 'text_vector' column
df_train.head()

In [None]:
# Show the distinct label values present in the target column
distinct_targets = {*df_train['target']}
print(distinct_targets)

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
# Labels come straight from the target column; features are the per-tweet
# embedding lists stacked into a 2-D array
y = df_train['target']
X = np.asarray(df_train['text_vector'].to_list())


# Sanity check of the dimensions: X should be (n_samples, n_features), y should be (n_samples,)
print(X.shape, y.shape, sep='\n')

In [None]:
#Checking how many labels per class
label_true =  np.sum(y == 1)# 3271
label_false = np.sum(y == 0)

#Plotting the result, with a title and axis labels so the figure can be read on its own
plt.figure(figsize=(5,5))
plt.bar(["True","False"], [label_true, label_false])
plt.title("Number of tweets per class")
plt.xlabel("target == 1")
plt.ylabel("Number of tweets")
plt.show()



In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic regression classifier (liblinear solver) on the training part
clf = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Predict the labels of the held-out part
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the held-out split
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the held-out split: rows are true labels, columns are predicted labels
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
#Load the competition test set and preview its first rows
test_csv_path = "/kaggle/input/nlp-getting-started/test.csv"
df_test = pd.read_csv(test_csv_path)
df_test.head()

In [None]:
# (rows, columns) of the test set
df_test.shape

In [None]:
#Clean the test tweets with the same preprocess_text function used for the training tweets
df_test['text'] = df_test['text'].map(preprocess_text)
df_test['text']

In [None]:
#Embed every test tweet and store the embedding as a plain Python list,
#so that each cell of the 'text_vector' column holds a list of floats
df_test['text_vector'] = df_test['text'].map(
    lambda tweet: np.asarray(text_to_vector(tweet.lower())).tolist()
)


In [None]:
# Stack the per-tweet embedding lists of the test set into a 2-D feature array
X_valid = np.asarray(df_test['text_vector'].to_list())


In [None]:
#Predict the targets of the test tweets with the classifier fitted on the training split
y_valid_preds = clf.predict(X_valid)


In [None]:
#Pair every test id with its predicted target and write the submission file
submission_columns = {'id': df_test['id'], 'target': y_valid_preds}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)
print("Submission successfully created!")