# NLP Kaggle Bootcamp The Basics of ML

<img src = "https://www.mdpi.com/files/multidisciplinary_topic_graphical_abstract/1949/WeChat%20Image_20240829171202.jpg">



## Problemin Tanımı
Bu proje, **Kaggle Bootcamp: The Basics of ML** yarışması kapsamında bir sınıflandırma modelidir.


# Veri Yükleme


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

DATA_DIR = '../data'
TRAIN_PATH = os.path.join(DATA_DIR, 'airline_tweets_train.csv')
TEST_PATH = os.path.join(DATA_DIR, 'airline_tweets_test.csv')


In [3]:
try:
    df_train = pd.read_csv(TRAIN_PATH)
    df_test = pd.read_csv(TEST_PATH)
    print(f'Train Shape: {df_train.shape}')
    print(f'Test Shape: {df_test.shape}')
except FileNotFoundError:
    print('HATA: Veri dosyaları bulunamadı.')
    df_train = pd.DataFrame()
    df_test = pd.DataFrame()


Train Shape: (4008, 6)
Test Shape: (1355, 6)


In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4008 entries, 0 to 4007
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Id                 4008 non-null   int64 
 1   airline_sentiment  4008 non-null   object
 2   airline            4008 non-null   object
 3   text               4008 non-null   object
 4   retweet_count      4008 non-null   int64 
 5   user_timezone      2728 non-null   object
dtypes: int64(2), object(4)
memory usage: 188.0+ KB


In [6]:
df_train.head()

Unnamed: 0,Id,airline_sentiment,airline,text,retweet_count,user_timezone
0,1450,negative,American,@AmericanAir thanks for the response. I know i...,0,
1,7983,positive,US Airways,@USAirways I'm enjoying my flights so far! You...,0,London
2,6479,positive,United,Hey @united you've upgraded me on a 10 hour In...,0,Amsterdam
3,2478,negative,United,"@united ""where we trick you into making us loo...",1,
4,6366,positive,United,@united it was delivered! Thank you for making...,0,


In [10]:
import pandas as pd
import re
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report  # classification_report eklendi

# Eğer NLTK verisi yoksa indirir (Kaggle'da internet açık olmalı)
# nltk.download('punkt') 

# Veri İnceleme & Preprocessing


In [9]:


# 1. Veriyi Hazırla
target_col = 'airline_sentiment'
text_col = 'text'

# Veri kontrolü ve yükleme
if not df_train.empty:
    X = df_train[text_col]
    y = df_train[target_col]
    
    # Train/Validation ayrımı
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # 2.  Tokenizer (Stemming için)
    stemmer = PorterStemmer()
    
    def stemming_tokenizer(text):
        # Sadece harfleri al
        text = re.sub(r"[^a-zA-Z]", " ", text)
        # Kelimelere böl
        tokens = text.split()
        # Stemming uygula (Flying -> Fli)
        stems = [stemmer.stem(word) for word in tokens]
        return stems

    # 3. Pipeline
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(
            tokenizer=stemming_tokenizer, 
            lowercase=True,               
            stop_words='english',         
            ngram_range=(1, 2),           
            max_features=5000             
        )),
        ('clf', LogisticRegression(max_iter=1000))
    ])

    print('Model Eğitiliyor...')
    pipeline.fit(X_train, y_train)
    
    # 4. Değerlendirme
    preds = pipeline.predict(X_val)
    
    print("-" * 30)
    print(f'Accuracy Score: {accuracy_score(y_val, preds):.4f}')
    print("-" * 30)
    print('Classification Report:')
    print(classification_report(y_val, preds))
    print("-" * 30)

Model Eğitiliyor...
------------------------------
Accuracy Score: 0.8766
------------------------------
Classification Report:
              precision    recall  f1-score   support

    negative       0.85      0.94      0.89       442
    positive       0.92      0.80      0.85       360

    accuracy                           0.88       802
   macro avg       0.88      0.87      0.87       802
weighted avg       0.88      0.88      0.88       802

------------------------------


# Model Saving &Submission


In [18]:
import joblib
import os
import pandas as pd

# --- AYARLAR ---
# Notebook bir alt klasörde olduğu için (örn: /notebooks), 
# bir üst dizine çıkmak için '../' kullanıyoruz.
MODELS_DIR = '../models'
OUTPUTS_DIR = '../outputs'

# Klasörlerin varlığını garantiye alalım (Zaten varsa hata vermez)
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(OUTPUTS_DIR, exist_ok=True)

# 1. Modeli Kaydet (../models/best_model.pkl)
if not df_train.empty:
    model_path = os.path.join(MODELS_DIR, 'best_model.pkl')
    joblib.dump(pipeline, model_path)
    print(f'✅ Model başarıyla kaydedildi: {os.path.abspath(model_path)}')
    
    # 2. Test Verisi Üzerinde Tahmin Yap
    if not df_test.empty:
        # Pipeline ham metni bekliyor
        X_test = df_test['text'].fillna('') 
        
        final_preds = pipeline.predict(X_test)
        
        # 3. Submission Dosyasını Oluştur (../outputs/submission.csv)
        submission = pd.DataFrame({
            'Id': df_test['Id'],
            'airline_sentiment': final_preds
        })
        
        submission_path = os.path.join(OUTPUTS_DIR, 'submission.csv')
        submission.to_csv(submission_path, index=False)
        print(f'✅ Submission dosyası başarıyla oluşturuldu: {os.path.abspath(submission_path)}')
        
        # İlk 5 satırı göster
        display(submission.head())

✅ Model başarıyla kaydedildi: C:\Users\Erhan\Documents\0.YapayZekaKursu\Projects\PBL Level2\Hw.15.BecomeAPro\20.NLP_BootcampML\models\best_model.pkl
✅ Submission dosyası başarıyla oluşturuldu: C:\Users\Erhan\Documents\0.YapayZekaKursu\Projects\PBL Level2\Hw.15.BecomeAPro\20.NLP_BootcampML\outputs\submission.csv


Unnamed: 0,Id,airline_sentiment
0,6679,positive
1,7908,positive
2,625,negative
3,828,negative
4,1557,negative
