<a href="https://colab.research.google.com/github/data-with-shobhit/Misc/blob/main/Task_2_Text_Classification_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

NOTE: RUN ONCE PER SEESION AND RESTART RUNTIME AFTER INSTALLATION TO LOAD DEPENDENCIES.

In [None]:
import spacy
print("Success" if spacy.util.is_package("en_core_web_md") else spacy.cli.download("en_core_web_md"))

In [45]:
import pandas as pd
import numpy as np
import re
import string
import random
import joblib
import spacy
from joblib import dump
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
import pickle

In [46]:
# Load Dataset
data = 'https://drive.google.com/file/d/11dJoMzkR1F0f8uJEYGhjtUHXNpT3eTML/view?usp=drive_link'

data = 'https://drive.google.com/uc?id=' + data.split('/')[-2]

In [47]:
df=pd.read_csv(data)

In [48]:
len(df)

100000

In [49]:
df.head()

Unnamed: 0,text
0,Turn on the profile picture guard to make your...
1,►►►hier klicken: http://bit.ly/freiheitsdressu...
2,"Weekend deal alert! Outdo Santa, today only, w..."
3,THIS TEENAGE GIRL SHARES THE BIGGEST SECRET OF...
4,Easy & convenient access to professional guida...


In [50]:
df=df.dropna()
df=df.drop_duplicates()

In [51]:
len(df)

79177

In [52]:
df.isna().sum()

Unnamed: 0,0
text,0


In [53]:
df.nunique()

Unnamed: 0,0
text,79177


In [54]:
def clean_text(text):

    text = str(text).lower()

    text = re.sub(r"http\S+|www.\S+", "", text)

    text = re.sub(r'\b\d{10}\b', '', text)

    text = text.encode('ascii', 'ignore').decode()

    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove emojis

    emoji_pattern = re.compile(

        "["
        u"\U0001F600-\U0001F64F"  # Emoticons
        u"\U0001F300-\U0001F5FF"  # Symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # Transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # Flags (iOS)
        "]+", flags=re.UNICODE
    )
    text = emoji_pattern.sub(r'', text)

    # Replace multiple spaces with a single space and strip leading/trailing spaces
    text = re.sub(r'\s+', ' ', text).strip()


    return text.lower()

In [55]:
cleaned_df= df.copy()

In [56]:
cleaned_df['text']=cleaned_df['text'].apply(clean_text)

In [57]:
keywords = {
    'Ecommerce': [
        'cart', 'checkout', 'wishlist', 'payment', 'delivery', 'shipping',
        'return', 'refund', 'order', 'sale', 'promo', 'coupon', 'store',
        'online', 'product', 'customer', 'review', 'inventory', 'stock',
        'secure payment', 'limited offer', 'bestseller', 'deal of the day',
        'loyalty', 'membership', 'order tracking', 'cash on delivery',
        'return policy', 'invoice', 'packaging', 'customer service',
        'gift card', 'bulk order', 'size chart', 'flash deal', 'subscription',
        'discount', 'free shipping', 'exchange', 'preorder', 'shopping cart',
        'checkout process', 'secure checkout', 'delivery status', 'fast shipping',
        'product details', 'add to cart', 'limited time offer', 'cashback',
        'one-day delivery', 'EMI options', 'payment gateway', 'user reviews',
        'compare prices', 'ecommerce trends', 'trustworthy seller', 'wholesale',
        'marketplace', 'shop now', 'clearance sale', 'daily deals', 'gift wrapping',
        'customer feedback', 'loyalty points', 'redeem points', 'online store',
        'store locator', 'catalog', 'order confirmation', 'track package',
        'restock alert', 'brand authenticity', 'fraud protection', 'secure transaction',
        'payment options', 'return authorization', 'customer complaint', 'order cancellation',
        'next-day shipping', 'affiliate program', 'cart abandonment', 'mobile shopping',
        'shipping address', 'express delivery', 'shopping festival', 'limited edition',
        'seller rating', 'wish list', 'fast checkout', 'guaranteed delivery','deal','deals'
    ],

    'Education': [
        'learn', 'study', 'course', 'class', 'teacher', 'student', 'school',
        'college', 'university', 'exam', 'test', 'assignment', 'certificate',
        'diploma', 'online course', 'lecture', 'syllabus', 'enroll', 'tuition',
        'e-learning', 'MOOC', 'webinar', 'training', 'quiz', 'academic',
        'library', 'research', 'skill', 'knowledge', 'mentor', 'online learning',
        'virtual class', 'internship', 'campus', 'higher education', 'degree',
        'scholarship', 'homework', 'tutorial', 'practical', 'project-based learning',
        'career guidance', 'study material', 'education system', 'subject specialization',
        'learning management system', 'self-paced learning', 'teaching method',
        'grading system', 'course module', 'educational technology', 'peer learning',
        'exam preparation', 'test series', 'competitive exam', 'scholarship program',
        'thesis', 'dissertation', 'internship program', 'language learning',
        'student support', 'tuition fees', 'lecture notes', 'group study',
        'learning outcomes', 'case studies', 'problem-solving skills',
        'soft skills training', 'distance education', 'interactive learning',
        'education fair', 'capstone project', 'study abroad', 'school admission',
        'career development', 'knowledge base', 'alumni network', 'study schedule',
        'learning curve', 'skill enhancement', 'coursework', 'teacher training',
        'e-textbooks', 'adaptive learning', 'education policy', 'faculty training'
    ],

    'Technology': [
        'software', 'hardware', 'AI', 'machine learning', 'data', 'cloud',
        'server', 'database', 'network', 'cybersecurity', 'coding', 'developer',
        'app', 'website', 'blockchain', 'robotics', 'automation', 'IoT',
        'virtual reality', 'augmented reality', 'API', 'IT', 'tech', 'smartphone',
        'laptop', 'processor', 'GPU', 'platform', 'analytics', 'cloud computing',
        'data science', 'big data', 'wearable tech', 'SaaS', 'DevOps', 'Git',
        'artificial intelligence', 'deep learning', 'predictive analytics',
        'business intelligence', 'web development', 'frontend', 'backend',
        'mobile app', 'AI ethics', 'AI-powered', 'data pipeline', 'data engineer',
        'cloud services', 'software architecture', 'agile development', 'CI/CD',
        'UX/UI design', 'virtual assistant', 'computer vision', 'image recognition',
        '5G technology', 'quantum computing', 'big data analytics', 'network security',
        'penetration testing', 'ethical hacking', 'cloud storage', 'VR headset',
        'AI chatbot', 'smart assistant', 'microservices', 'containerization',
        'data governance', 'self-driving cars', 'embedded systems', 'open source',
        'AI-powered tools', 'automated testing', 'cyber attack', 'firewall',
        'AI-generated content', 'mobile computing', 'hyperautomation', 'no-code',
        'low-code', 'privacy policy', 'blockchain security', 'autonomous systems',
    ],



    'Healthcare': [
        'doctor', 'hospital', 'medicine', 'health', 'wellness', 'patient',
        'treatment', 'therapy', 'diagnosis', 'surgery', 'clinic', 'pharmacy',
        'vaccine', 'mental health', 'nutrition', 'fitness', 'telemedicine',
        'nursing', 'first aid', 'emergency', 'healthcare provider', 'insurance',
        'medical records', 'prescription', 'rehabilitation', 'physical therapy',
        'symptoms', 'chronic disease', 'cardiology', 'oncology', 'neurology',
        'pediatrics', 'dermatology', 'radiology', 'lab test', 'X-ray', 'MRI',
        'blood pressure', 'diabetes', 'hypertension', 'stroke', 'cancer',
        'surgery recovery', 'vaccination', 'preventive care', 'health monitoring',
        'electronic health records', 'urgent care', 'pandemic', 'epidemic',
        'public health', 'medical research', 'clinical trials', 'genetics',
        'biotechnology', 'medical device', 'wearable health tech', 'telehealth',
        'fitness tracker', 'nutritional supplement', 'alternative medicine',
        'holistic health', 'home remedies', 'sleep disorders', 'mental well-being',
        'stress management', 'mindfulness', 'physical fitness', 'workout',
        'yoga', 'diet plan', 'weight loss', 'healthy lifestyle', 'medical consultation',
        'pharmaceuticals', 'natural remedies', 'exercise', 'protein supplements',
        'immune system', 'nurse practitioner', 'urgent surgery', 'lab diagnostics',
        'prenatal care', 'postnatal care', 'homeopathy', 'osteopathy',
        'orthopedics', 'dental care', 'eye care', 'skin care', 'cosmetic surgery',
        'healthcare AI', 'healthcare robotics', 'hospital management', 'patient portal'
    ],

    'Finance': [
        'banking', 'investment', 'loans', 'mortgage', 'credit', 'debt',
        'budgeting', 'savings', 'insurance', 'stock market', 'mutual funds',
        'trading', 'cryptocurrency', 'bitcoin', 'Ethereum', 'personal finance',
        'financial planning', 'retirement', 'pension', 'interest rates', 'credit score',
        'credit card', 'debit card', 'tax', 'tax return', 'audit', 'wealth management',
        'portfolio', 'dividend', 'passive income', 'inflation', 'recession',
        'financial literacy', 'accounting', 'CPA', 'bookkeeping', 'invoice',
        'money transfer', 'wire transfer', 'digital banking', 'mobile banking',
        'RBI regulations', 'financial fraud', 'investment portfolio', 'hedge funds',
        'venture capital', 'private equity', 'loan approval', 'mortgage refinance',
        'small business loans', 'angel investor', 'financial risk', 'money market',
        'economic growth', 'fiscal policy', 'monetary policy', 'GDP', 'income tax',
        'business valuation', 'insurance policy', 'credit union', 'fintech',
        'wealth advisors', 'retirement savings', 'financial independence',
        'credit report', 'cash flow', 'payday loans', 'emergency funds', 'bonds',
        'real estate investment', 'forex trading', 'derivatives', 'options trading',
        'economic indicators', 'financial security', 'budget planning', 'money management',
        'secured loans', 'unsecured loans', 'hedging', 'market trends', 'gold investment',
        'asset allocation', 'robo-advisors', 'crypto wallet', 'decentralized finance',
        'liquidity', 'financial inclusion', 'stock exchange', 'exchange-traded funds (ETFs)'
    ],



    'Entertainment': [
        'movies', 'TV shows', 'music', 'concerts', 'theater', 'celebrities',
        'Hollywood', 'Bollywood', 'Netflix', 'Hulu', 'Amazon Prime', 'Disney+',
        'sports', 'video games', 'gaming', 'streaming', 'YouTube', 'TikTok',
        'comedy', 'stand-up', 'podcast', 'radio', 'animation', 'anime',
        'manga', 'film festivals', 'cinema', 'independent films', 'box office',
        'director', 'screenplay', 'acting', 'reality TV', 'awards shows',
        'Grammy Awards', 'Oscars', 'Emmy Awards', 'celebrity gossip',
        'music festivals', 'Coachella', 'rock music', 'pop music', 'hip-hop',
        'jazz', 'classical music', 'K-pop', 'live performance', 'virtual concerts',
        'e-sports', 'gaming tournaments', 'VR gaming', 'music streaming',
        'Spotify', 'Apple Music', 'gaming consoles', 'PlayStation', 'Xbox',
        'Nintendo', 'comic books', 'graphic novels', 'novels', 'fiction',
        'fantasy series', 'superheroes', 'Marvel', 'DC Comics', 'indie music',
        'dance', 'ballet', 'musical', 'opera', 'soundtracks', 'movie trailers',
        'film reviews', 'critic ratings', 'box office collection', 'TV ratings',
        'reality shows', 'theme parks', 'Broadway', 'live theater', 'variety shows',
        'stand-up specials', 'film adaptation', 'book adaptation', 'fan theories',
        'trending entertainment news', 'celebrity interviews', 'behind the scenes',
    ],

    'Travel': [
        'vacation', 'flight', 'hotel', 'booking', 'trip', 'adventure', 'destination',
        'sightseeing', 'tourism', 'passport', 'visa', 'road trip', 'airbnb', 'hostel',
        'cruise', 'beach', 'mountains', 'travel guide', 'itinerary', 'budget travel',
        'solo travel', 'family trip', 'honeymoon', 'resort', 'all-inclusive',
        'backpacking', 'camping', 'hiking', 'travel insurance', 'car rental',
        'travel blog', 'safari', 'eco-tourism', 'luxury travel', 'cultural tourism',
        'travel deals', 'last-minute travel', 'travel agency', 'tour packages',
        'city tour', 'road trip planner', 'weekend getaway', 'airfare', 'cheap flights',
        'budget airlines', 'business travel', 'travel photography', 'holiday season',
        'best places to visit', 'local cuisine', 'souvenirs', 'travel hacks',
        'public transport', 'train travel', 'bus travel', 'cross-country', 'glamping',
        'remote work travel', 'digital nomad', 'workation', 'staycation', 'resort stay',
        'yacht trip', 'airline miles', 'frequent flyer', 'overseas travel',
        'hidden gems', 'historical sites', 'monuments', 'theme parks', 'national parks',
        'travel vlogger', 'travel safety', 'packing tips', 'language barrier',
        'travel visa', 'best travel credit cards', 'luxury resorts', 'offbeat destinations',
        'group travel', 'road map', 'climate impact travel', 'eco-friendly travel',
        'adventure sports', 'paragliding', 'scuba diving', 'skiing', 'bungee jumping',
        'hot air balloon', 'travel disruptions', 'customs regulations', 'city breaks',
        'food tourism', 'romantic destinations', 'airline lounge access',
        'airport tips', 'cultural experiences', 'hidden travel costs', 'pet-friendly travel'
    ],

    'Sports': [
        'football', 'soccer', 'basketball', 'tennis', 'cricket', 'baseball', 'golf',
        'rugby', 'badminton', 'hockey', 'volleyball', 'table tennis', 'swimming',
        'athletics', 'track and field', 'marathon', 'triathlon', 'cycling', 'gymnastics',
        'boxing', 'MMA', 'wrestling', 'karate', 'judo', 'taekwondo', 'archery', 'fencing',
        'skiing', 'snowboarding', 'surfing', 'skateboarding', 'motorsports', 'Formula 1',
        'NASCAR', 'MotoGP', 'horse racing', 'rowing', 'kayaking', 'canoeing',
        'rock climbing', 'mountaineering', 'bouldering', 'weightlifting', 'powerlifting',
        'strongman', 'bodybuilding', 'crossfit', 'yoga', 'pilates', 'aerobics',
        'esports', 'gaming tournaments', 'Olympics', 'Paralympics', 'World Cup',
        'Premier League', 'Champions League', 'NBA', 'NFL', 'MLB', 'NHL', 'UFC',
        'Wimbledon', 'French Open', 'US Open', 'Australian Open', 'Tour de France',
        'WrestleMania', 'Super Bowl', 'Grand Slam', 'sports injuries', 'training',
        'coaching', 'sports nutrition', 'fitness', 'gym', 'personal training',
        'cardio', 'strength training', 'agility', 'endurance', 'mental toughness',
        'sports psychology', 'hydration', 'warm-up', 'cool-down', 'stretching',
        'team sports', 'individual sports', 'referee', 'umpire', 'goalkeeper',
        'striker', 'defender', 'midfielder', 'pitch', 'court', 'stadium', 'league',
        'tournament', 'medal', 'trophy', 'world rankings', 'records', 'sports betting',
        'fantasy sports', 'fan engagement', 'sports journalism', 'sports broadcasting',
        'sports analytics', 'VAR', 'sports technology', 'wearable fitness trackers','fitness coach','fitness'
    ],

    'News': [
        'breaking news', 'headlines', 'daily news', 'current affairs', 'global news',
        'politics', 'elections', 'government', 'laws', 'policies', 'international relations',
        'war', 'conflict', 'peace talks', 'diplomacy', 'economy', 'stock market',
        'inflation', 'interest rates', 'business news', 'corporate news', 'startups',
        'mergers and acquisitions', 'trade', 'cryptocurrency', 'technology news',
        'AI advancements', 'cybersecurity', 'gadgets', 'science news', 'space exploration',
        'NASA', 'climate change', 'environment', 'natural disasters', 'wildfires',
        'hurricanes', 'earthquakes', 'floods', 'health news', 'pandemic', 'vaccines',
        'mental health', 'medical research', 'new diseases', 'sports news',
        'match highlights', 'tournament updates', 'player transfers', 'Olympics',
        'entertainment news', 'movies', 'TV shows', 'Hollywood', 'Bollywood',
        'celebrity gossip', 'award shows', 'music industry', 'fashion trends',
        'crime news', 'fraud', 'scams', 'corruption', 'terrorism', 'court cases',
        'legal news', 'human rights', 'social justice', 'protests', 'activism',
        'education news', 'exams', 'admissions', 'university rankings',
        'career news', 'job market', 'remote work trends', 'real estate market',
        'housing prices', 'transportation news', 'aviation', 'public transport updates',
        'electric vehicles', 'fuel prices', 'space news', 'Mars mission', 'satellites',
        'military news', 'defense policies', 'nuclear weapons', 'historic events',
        'anniversaries', 'editorials', 'opinion pieces', 'fact-checking', 'fake news',
        'media ethics', 'journalism', 'press freedom', 'censorship', 'global summits',
        'G20', 'UN meetings', 'economic forums', 'local news', 'community updates'
    ]


}



In [58]:
# Function to categorize text based of keywords

def labels(text):
    words = set(re.findall(r'\b\w+\b', text))
    for category, keyword in keywords.items():
        if any(word in text for word in keyword):
            return category
    return 'Other'

In [59]:
cleaned_df['label']=cleaned_df['text'].apply(labels)

In [None]:
cleaned_df

In [60]:
cleaned_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Other,37334
Ecommerce,18179
Education,7313
Technology,5758
Entertainment,2514
Healthcare,2376
Travel,2116
News,1625
Finance,1017
Sports,945


In [61]:
#Creating new dataset for training purpose with equally sample size of each categories
balanced_df = cleaned_df.groupby('label').apply(lambda x: x.sample(n=940, random_state=42, replace=False)).reset_index(drop=True)


print(balanced_df['label'].value_counts())


label
Ecommerce        940
Education        940
Entertainment    940
Finance          940
Healthcare       940
News             940
Other            940
Sports           940
Technology       940
Travel           940
Name: count, dtype: int64


  balanced_df = cleaned_df.groupby('label').apply(lambda x: x.sample(n=940, random_state=42, replace=False)).reset_index(drop=True)


In [62]:
balanced_df.head()

Unnamed: 0,text,label
0,figuactiv 28 tage body mission expert program ...,Ecommerce
1,do you want to outrank your competitors and ge...,Ecommerce
2,with oldfashioned home security you could be l...,Ecommerce
3,hey kamloops have you seen our glowmen around ...,Ecommerce
4,designed and made in great britain our wax cot...,Ecommerce


In [70]:
balanced_df.head()

Unnamed: 0,text,label,label_num
0,figuactiv tage body mission expert program fig...,Ecommerce,1
1,want outrank competitor customer call run free...,Ecommerce,1
2,oldfashioned home security leave wait week ins...,Ecommerce,1
3,hey kamloops see glowman town prize pocket sur...,Ecommerce,1
4,design great britain wax cotton jacket design ...,Ecommerce,1


In [63]:
# mapping category labels to numerical values.

balanced_df['label_num'] = balanced_df.label.map({"Education": 0, "Ecommerce": 1, "Technology": 2, "Healthcare": 3, "Entertainment": 4, "Finance": 5, "News": 6,
"Travel": 7, "Sports": 8, "Other": 9})

In [64]:
"""
    Function to preprocess text using SpaCy:
    - Tokenizes the text
    - Removes stopwords, punctuation, and non-alphabetic tokens
    - Applies lemmatization to get the base form of words
    - Returns the cleaned text as a string

"""


nlp = spacy.load("en_core_web_md")

def pre_process_text(text):

    doc = nlp(text)

    filtered_tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and token.is_alpha]

    return " ".join(filtered_tokens)


In [65]:
balanced_df['text']=balanced_df['text'].apply(pre_process_text)

In [66]:
x_train, x_test, y_train, y_test = train_test_split(balanced_df['text'], balanced_df['label_num'], test_size=0.2, random_state=42)

In [67]:
# Converting text data to TF-IDF features

tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

x_train_tfidf = tfidf.fit_transform(x_train)

x_test_tfidf = tfidf.transform(x_test)


joblib.dump(tfidf, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']

In [68]:
# Training

models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42),
    "NaiveBayes": MultinomialNB()
}

for name, model in models.items():
  print(f" Training {name}...")
  model.fit(x_train_tfidf, y_train)
  preds = model.predict(x_test_tfidf)

  accuracy = accuracy_score(y_test, preds)
  print(f"{name} Accuracy: {accuracy:.4f}")
  print(classification_report(y_test, preds))

  with open(f"{name}_model.pkl", "wb") as f:
    joblib.dump(model, f)


 Training RandomForest...
RandomForest Accuracy: 0.7606
              precision    recall  f1-score   support

           0       0.94      0.71      0.81       192
           1       0.92      0.78      0.85       182
           2       0.88      0.64      0.74       188
           3       0.85      0.86      0.85       196
           4       0.72      0.68      0.70       166
           5       0.93      0.81      0.86       190
           6       0.77      0.67      0.72       184
           7       0.85      0.83      0.84       202
           8       0.86      0.74      0.80       192
           9       0.41      0.88      0.56       188

    accuracy                           0.76      1880
   macro avg       0.81      0.76      0.77      1880
weighted avg       0.82      0.76      0.77      1880

 Training XGBoost...
XGBoost Accuracy: 0.7569
              precision    recall  f1-score   support

           0       0.93      0.75      0.83       192
           1       0.97      0

*RandomForest with an accuracy of 0.7606*