Training the TF-IDF vectorizer

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd

In [43]:
import joblib

In [6]:
import re
import nltk
#from sklearn.pipeline import Pipeline
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.model_selection import train_test_split
#from sklearn.metrics import classification_report
import pandas as pd
from langdetect import detect, DetectorFactory
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Pin langdetect's seed so repeated runs give the same detection result.
DetectorFactory.seed = 0

# Fetch the NLTK resources used below: the stopword list and the tokenizer models.
for nltk_resource in ('stopwords', 'punkt_tab', 'punkt'):
    nltk.download(nltk_resource)

# Shared objects used by the preprocessing helpers.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Define each preprocessing function

def remove_noise(text):
    """Strip everything except ASCII letters and whitespace, then tidy spacing.

    The input is coerced with ``str()`` first, so non-string values are
    accepted. Runs of whitespace collapse to a single space and the result
    is trimmed at both ends.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', str(text))
    return re.sub(r'\s+', ' ', letters_only).strip()

def remove_emojis_and_links(text):
    """Delete emoji characters and URL-like tokens from ``text``.

    Emojis are matched by four Unicode ranges (emoticons, symbols and
    pictographs, transport and map symbols, flags). A link is any run of
    non-whitespace characters starting at ``http`` or ``www``. Surrounding
    whitespace is left untouched.
    """
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags
    )
    without_emojis = re.sub("[" + emoji_ranges + "]+", r'', text, flags=re.UNICODE)
    return re.sub(r'http\S+|www\S+', r'', without_emojis)

def is_english(text):
    """Return True when langdetect classifies ``text`` as English ('en').

    Detection is best-effort: if ``detect`` raises an ordinary exception
    (for example because the input cannot be classified), the text is
    treated as not English and False is returned.
    """
    try:
        return detect(text) == 'en'
    except Exception:
        # Deliberately broad, but not a bare ``except:`` — KeyboardInterrupt
        # and SystemExit still propagate, so a long run can be interrupted.
        return False

def remove_stopwords(text):
    """Drop whitespace-separated tokens whose lowercase form is in ``stop_words``.

    Surviving tokens keep their original casing and are re-joined with
    single spaces.
    """
    kept = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

def stem_text(text):
    """Tokenize ``text`` with ``word_tokenize`` and stem each token.

    Every token is passed through the shared ``stemmer``; the stemmed tokens
    are returned joined by single spaces.
    """
    return ' '.join(map(stemmer.stem, word_tokenize(text)))

def text_preprocessing(text):
    """Run the full cleaning pipeline on one document.

    Text that ``is_english`` rejects yields an empty string. Otherwise the
    text is lowercased and passed, in order, through ``remove_noise``,
    ``remove_emojis_and_links``, ``remove_stopwords`` and ``stem_text``.
    """
    # Input that is not detected as English contributes nothing.
    if not is_english(text):
        return ""

    cleaned = text.lower()  # normalization
    pipeline = (
        remove_noise,             # strip non-letter characters
        remove_emojis_and_links,  # strip emojis and links
        remove_stopwords,         # drop stopwords
        stem_text,                # apply stemming
    )
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ervinballa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/ervinballa/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ervinballa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Load the dataset, keep only the text and label columns, and drop rows where
# either value is missing. Chained without ``inplace=True`` so the cell never
# mutates a frame derived from another one (which can raise
# SettingWithCopyWarning) and gives the same result each time it is re-run.
df = (
    pd.read_csv('dataset.csv')[['stemmed_text', 'issue_label']]
    .dropna(subset=['stemmed_text', 'issue_label'])
)

# Separate features and labels
X = df['stemmed_text']  # Text column
y = df['issue_label']   # Label column

# Hold out 20% of the rows as the test set, stratified by label.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Split the remaining 80% again: 20% of it (16% of all rows) becomes the
# validation set, leaving 64% of all rows for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=42, stratify=y_train_val
)


In [7]:
# Build a TF-IDF vectorizer that runs text_preprocessing on every document and
# keeps at most 5000 vocabulary terms, then fit it on the training split only.
# NOTE(review): the input column is named 'stemmed_text'; if it is already
# preprocessed, language detection and stemming run a second time here —
# confirm this is intended.
vectorizer = TfidfVectorizer(preprocessor=text_preprocessing, max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)

In [9]:
# Summarize the learned vocabulary instead of printing the whole
# term -> column-index mapping (up to 5000 entries).
vocabulary = vectorizer.vocabulary_
print(f"Vocabulary size: {len(vocabulary)}")
# Show the first 20 entries ordered by column index as a representative sample.
print(dict(sorted(vocabulary.items(), key=lambda item: item[1])[:20]))



In [42]:
all_feature_names = vectorizer.get_feature_names_out()

# Look the term up directly in the vocabulary mapping (term -> column index)
# rather than scanning every feature name for a match.
term = "continu"
term_index = vectorizer.vocabulary_.get(term)
if term_index is None:
    # Make a missing term visible instead of silently printing nothing.
    print(f"{term!r} is not in the fitted vocabulary")
else:
    print(f"{term} {vectorizer.idf_[term_index]}")

continu 5.3150940870474495


In [44]:
# Persist the fitted vectorizer to disk.
# NOTE(review): the vectorizer holds a reference to text_preprocessing, and
# pickle stores plain functions by name rather than by value — whichever
# process loads this file will presumably need that function defined under
# the same module path. Only load pickle files from trusted sources.
joblib.dump(vectorizer, 'vectorizer5k.pkl')

['vectorizer5k.pkl']