**Twitter Sentiment Analysis with/without Word Embeddings**


In [1]:
# Install the Kaggle command-line client that is used below to download the dataset.
!pip install kaggle




In [2]:
from google.colab import files
# Select and upload kaggle.json. The return value maps each uploaded file name to
# its raw contents, so it is bound to a name instead of being left as the cell's
# last expression; otherwise the API key is echoed into the saved notebook output.
uploaded_files = files.upload()


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"senazici","key":"<REDACTED>"}'}

In [3]:
import os
# Move the uploaded credentials file into /root/.kaggle, creating the directory if needed.
os.makedirs('/root/.kaggle', exist_ok=True)
os.rename("kaggle.json", "/root/.kaggle/kaggle.json")
# The mode must be an octal literal: decimal 600 equals 0o1130, which is not
# "owner read/write only". 0o600 restricts the key file to its owner.
os.chmod("/root/.kaggle/kaggle.json", 0o600)


In [4]:
# Download the kazanova/sentiment140 dataset archive with the Kaggle CLI.
!kaggle datasets download -d kazanova/sentiment140


Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other


In [5]:
import zipfile
# Unpack every member of the downloaded archive into the "sentiment140" directory.
with zipfile.ZipFile("sentiment140.zip", 'r') as archive:
    archive.extractall("sentiment140")


In [6]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [7]:
import pandas as pd

# The CSV ships without a header row, so read it raw and name the columns afterwards.
csv_path = "sentiment140/training.1600000.processed.noemoticon.csv"
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']

df = pd.read_csv(csv_path, encoding='latin-1', header=None)
df.columns = column_names

# Preview only the two columns the rest of the notebook uses.
df[['target', 'text']].head()


Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [8]:
# Drop neutral rows (label 2), if any. The explicit .copy() makes the filtered
# result an independent frame, so the column assignment below is not a
# chained assignment onto a slice of the original DataFrame.
df = df[df['target'] != 2].copy()
# Remap labels to binary: 0 stays 0 (negative), 4 becomes 1 (positive).
df['target'] = df['target'].replace({0: 0, 4: 1})


**Veri Temizleme (Preprocessing)**

In [9]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch the NLTK resources requested by this notebook, in the original order.
for resource in ('stopwords', 'punkt', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Shared helpers read by gelismis_temizle: a WordNet lemmatizer and the English stop-word set.
lemmatizer = WordNetLemmatizer()

stop_words = set(stopwords.words('english'))

def gelismis_temizle(text):
    """Normalize a raw tweet into a space-joined string of lemmatized tokens.

    Non-string input yields an empty string. Otherwise the text is lower-cased
    and stripped of URLs, @mentions, '#' characters and every character that is
    not a-z or whitespace. Whitespace is then collapsed, tokens found in the
    module-level ``stop_words`` set are dropped, and the remaining tokens are
    passed through the module-level ``lemmatizer``.
    """
    if not isinstance(text, str):
        return ""

    # (pattern, replacement, flags) triples, applied strictly in this order.
    substitutions = (
        (r'http\S+|www\S+|https\S+', '', re.MULTILINE),
        (r'@\w+', '', 0),
        (r'#', '', 0),
        (r'[^a-z\s]', '', 0),
        (r'\s+', ' ', 0),
    )

    cleaned = text.lower()
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    # Filter stop words and lemmatize the survivors in a single pass.
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in cleaned.strip().split()
        if token not in stop_words
    )

# Clean every tweet and store the result next to the raw text.
cleaned_column = df['text'].apply(gelismis_temizle)
df['clean_text'] = cleaned_column

# Print a few raw/cleaned pairs as a sanity check.
preview = df[['text', 'clean_text']].head()
print("Temizlenmiş metin örnekleri:")
print(preview)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Temizlenmiş metin örnekleri:
                                                text  \
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...   
1  is upset that he can't update his Facebook by ...   
2  @Kenichan I dived many times for the ball. Man...   
3    my whole body feels itchy and like its on fire    
4  @nationwideclass no, it's not behaving at all....   

                                          clean_text  
0      thats bummer shoulda got david carr third day  
1  upset cant update facebook texting might cry r...  
2    dived many time ball managed save rest go bound  
3                    whole body feel itchy like fire  
4                           behaving im mad cant see  


**Eğitim ve Test Seti Ayırma**


In [10]:
# Labels are the binary targets; features are the cleaned tweets.
y = df['target']
X = df['clean_text']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# normalize=True reports class shares as proportions rather than raw counts.
train_distribution = y_train.value_counts(normalize=True)
test_distribution = y_test.value_counts(normalize=True)

print(f"Eğitim seti boyutu: {len(X_train)}")
print(f"Test seti boyutu: {len(X_test)}")
print("\nEğitim setindeki hedef dağılımı:")
print(train_distribution)
print("\nTest setindeki hedef dağılımı:")
print(test_distribution)


Eğitim seti boyutu: 1280000
Test seti boyutu: 320000

Eğitim setindeki hedef dağılımı:
target
0    0.500395
1    0.499605
Name: proportion, dtype: float64

Test setindeki hedef dağılımı:
target
1    0.501581
0    0.498419
Name: proportion, dtype: float64


**TF-IDF Vektörleştirme (Embedding’siz çözüm)**

---



In [11]:

from sklearn.feature_extraction.text import TfidfVectorizer


# Vectorizer limited to at most 10,000 features built from 1-grams and 2-grams.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
)

print("TF-IDF vektörleştirici eğitiliyor ve metinler dönüştürülüyor...")

# Fit on the training text only, then reuse the fitted vectorizer on the test text.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print("TF-IDF vektörleştirme tamamlandı.")
print(f"Vektörleştirilmiş eğitim verisi boyutu: {X_train_tfidf.shape}")
print(f"Vektörleştirilmiş test verisi boyutu: {X_test_tfidf.shape}")
print(f"Öğrenilen kelime dağarcığı boyutu: {len(tfidf_vectorizer.vocabulary_)}")




TF-IDF vektörleştirici eğitiliyor ve metinler dönüştürülüyor...
TF-IDF vektörleştirme tamamlandı.
Vektörleştirilmiş eğitim verisi boyutu: (1280000, 10000)
Vektörleştirilmiş test verisi boyutu: (320000, 10000)
Öğrenilen kelime dağarcığı boyutu: 10000


**Model Eğitimi (Logistic Regression)**

In [12]:

from sklearn.linear_model import LogisticRegression


print("TF-IDF ile Logistic Regression modeli eğitiliyor...")

# Build and fit in one expression; fit() returns the fitted estimator itself.
tfidf_lr_model = LogisticRegression(
    max_iter=1000,
    random_state=42,
).fit(X_train_tfidf, y_train)

print("TF-IDF ile Logistic Regression modeli eğitimi tamamlandı.")





TF-IDF ile Logistic Regression modeli eğitiliyor...
TF-IDF ile Logistic Regression modeli eğitimi tamamlandı.


**Tahmin ve Değerlendirme**

In [13]:

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# Predict on the held-out TF-IDF matrix.
y_pred_tfidf_lr = tfidf_lr_model.predict(X_test_tfidf)

# Compute every metric first, then report them in a fixed order.
accuracy = accuracy_score(y_test, y_pred_tfidf_lr)
conf_matrix = confusion_matrix(y_test, y_pred_tfidf_lr)
class_report = classification_report(y_test, y_pred_tfidf_lr)

print("\n--- TF-IDF + Logistic Regression Model Değerlendirme Sonuçları ---")
print(f"Doğruluk (Accuracy): {accuracy:.4f}")  # formatted to four decimal places
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)







--- TF-IDF + Logistic Regression Model Değerlendirme Sonuçları ---
Doğruluk (Accuracy): 0.7843

Confusion Matrix:
[[121387  38107]
 [ 30910 129596]]

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.76      0.78    159494
           1       0.77      0.81      0.79    160506

    accuracy                           0.78    320000
   macro avg       0.78      0.78      0.78    320000
weighted avg       0.78      0.78      0.78    320000



In [None]:
# Install gensim without a version pin.
# NOTE(review): the import check that follows this cell failed with a numpy
# binary-incompatibility error in the recorded run; consider pinning versions here.
!pip install  gensim


In [29]:
# Smoke test: import the packages so that an installation problem shows up here
# rather than in a later cell.
import gensim
import spacy
import numpy
import thinc

# Reached only if every import above succeeded.
print("Her şey başarıyla kuruldu.")


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# Remove the currently installed numpy and gensim (-y skips the confirmation prompt).
!pip uninstall -y numpy gensim


In [None]:
# Install pinned versions of numpy and gensim.
# NOTE(review): the runtime presumably has to be restarted before the newly
# installed numpy is picked up by the kernel; confirm.
!pip install numpy==1.24.4
!pip install gensim==4.3.2


In [None]:
import warnings
# Install a global "ignore" filter: all Python warnings are suppressed from here on.
warnings.filterwarnings("ignore")
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
# Fetch NLTK's 'punkt' tokenizer data.
nltk.download('punkt')


In [None]:
import re
def simple_tokenizer(text):
    """Return the lower-cased word tokens of ``text`` as a list of strings.

    A token is a maximal run of ``\\w`` characters delimited by word boundaries.
    """
    lowered = text.lower()
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(lowered)

# Tokenize every cleaned tweet; each cell of 'tokens' holds a list of word strings.
df['tokens'] = df['clean_text'].apply(simple_tokenizer)


**Word2Vec modeli eğit**

In [None]:
# NOTE(review): gensim is already installed (pinned) and imported by earlier cells,
# so this install and these imports look redundant; confirm before removing.
!pip install gensim
import gensim
from gensim.models import Word2Vec



In [None]:
# Convert the 'tokens' column into a plain list of token lists.
sentences = df['tokens'].tolist()

# Train the Word2Vec model on those sentences.
w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)


In [None]:
import numpy as np

def get_avg_vector(tokens, model, vector_size):
    """Average the embedding vectors of the tokens known to ``model``.

    Args:
        tokens: iterable of token strings.
        model: object whose ``wv`` attribute supports ``in`` and ``[]`` lookups by token.
        vector_size: length of the zero vector returned when no token is known.

    Returns:
        The element-wise mean of the vectors of known tokens, or
        ``np.zeros(vector_size)`` when none of the tokens is in ``model.wv``.
    """
    known_vectors = []
    for token in tokens:
        if token in model.wv:
            known_vectors.append(model.wv[token])

    if known_vectors:
        return np.mean(known_vectors, axis=0)
    return np.zeros(vector_size)

# Must match the vector_size the Word2Vec model was trained with.
vector_size = 100


def _embed_tokens(tokens):
    """Average the Word2Vec vectors of one tweet's tokens."""
    return get_avg_vector(tokens, w2v_model, vector_size)


# One averaged embedding per tweet.
df['vector'] = df['tokens'].apply(_embed_tokens)


**Sınıflandırma Modeli Kur (örneğin Logistic Regression)**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Labels first, then the per-tweet averaged embeddings as a plain list.
y = df['target']  # or df['label'], whichever column holds the labels
X = list(df['vector'])

# Same 80/20 split parameters and seed as the TF-IDF experiment.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# fit() returns the fitted estimator, so construction and training are chained.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


In [None]:
from gensim.models import Word2Vec

# Train a separate Word2Vec model for export (min_count=1 here, unlike w2v_model's min_count=2).
model = Word2Vec(
    sentences=df['tokens'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Vocabulary words and their vectors, in matching order.
words = list(model.wv.index_to_key)
vectors = model.wv[words]

# Write one line per word: the word, then its tab-separated vector components.
# NOTE(review): if this file is meant for an embedding-projector style viewer
# alongside metadata.tsv, the vectors file may be expected to hold numbers only;
# confirm whether the leading word column is wanted.
with open('vectors.tsv', 'w') as tsv_file:
    for word, vector in zip(words, vectors):
        components = "\t".join(map(str, vector))
        tsv_file.write(f"{word}\t" + components + "\n")


In [None]:
from google.colab import files
# Send the exported vectors file from the Colab runtime to the local machine.
files.download('vectors.tsv')


In [None]:
# Create the metadata file: one vocabulary word per line, in the same order as `words`.
with open('metadata.tsv', 'w') as meta_file:
    meta_file.write("".join(f"{word}\n" for word in words))


In [None]:
from google.colab import files

# Send the exported metadata file from the Colab runtime to the local machine.
files.download('metadata.tsv')