**Twitter Sentiment Analysis with/without Word Embeddings**


In [None]:
# Install the Kaggle command-line client used below to download the dataset.
!pip install kaggle




In [None]:
from google.colab import files

# Opens Colab's upload widget: select and upload the kaggle.json token file.
# The return value maps each uploaded filename to its raw bytes. It is bound
# to a name so it is not the cell's last expression; otherwise the token
# file's contents (including the API key) are echoed into the saved notebook
# output.
uploaded = files.upload()


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"senazici","key":"<REDACTED>"}'}

In [None]:
import os

# Move the uploaded token into /root/.kaggle, where the Kaggle CLI invoked in
# the following cell is expected to look for it.
os.makedirs('/root/.kaggle', exist_ok=True)
os.rename("kaggle.json", "/root/.kaggle/kaggle.json")
# Permission bits must be written as an octal literal: 0o600 means owner
# read/write only. The decimal literal 600 equals 0o1130, which sets the
# sticky bit plus group write/execute and leaves the owner with execute only.
os.chmod("/root/.kaggle/kaggle.json", 0o600)


In [None]:
# Download the kazanova/sentiment140 dataset archive (sentiment140.zip) with the Kaggle CLI.
!kaggle datasets download -d kazanova/sentiment140


Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
sentiment140.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
import zipfile

# Unpack the downloaded archive into a local "sentiment140" directory.
with zipfile.ZipFile("sentiment140.zip", mode="r") as archive:
    archive.extractall(path="sentiment140")


In [None]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [None]:
import pandas as pd

# The CSV is read without a header row, so the six column names are supplied here.
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
df = pd.read_csv(
    "sentiment140/training.1600000.processed.noemoticon.csv",
    encoding='latin-1',
    header=None,
    names=column_names,
)

# Preview the label and tweet-text columns.
df.loc[:, ['target', 'text']].head()


Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Drop neutral rows (target == 2), if there are any. The explicit .copy()
# makes the filtered frame an independent object, so the column assignment
# below is never treated as a write into a slice of the original frame
# (which can emit SettingWithCopyWarning on pandas versions without
# copy-on-write when rows were actually removed).
df = df[df['target'] != 2].copy()
# Recode the labels: 0 stays 0 (negative), 4 becomes 1 (positive).
df['target'] = df['target'].replace({0: 0, 4: 1})


**Veri Temizleme (Preprocessing)**

In [None]:
def temizle(text):
    """Return a lowercased copy of `text` with tweet noise stripped out.

    Removed, in this order: links (``http`` followed by any non-whitespace),
    @mentions, #hashtags (the tag word is removed too, not only the ``#``),
    and finally every character that is neither a word character nor
    whitespace. Whitespace itself is left untouched, so gaps remain where
    tokens were removed.
    """
    removal_patterns = (
        r'http\S+',   # links
        r'@\w+',      # mentions
        r'#\w+',      # hashtags
        r'[^\w\s]',   # punctuation
    )
    cleaned = text
    # The patterns are applied one after another; the order matters because
    # the punctuation pass would otherwise strip the '@', '#' and ':' that
    # the earlier patterns rely on.
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.lower()

# Store a cleaned version of every tweet next to the raw text.
df['clean_text'] = df['text'].map(temizle)


**Eğitim ve Test Seti Ayırma**


In [None]:
# Features are the cleaned tweets; labels are the sentiment targets.
X, y = df['clean_text'], df['target']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


**TF-IDF Vektörleştirme (Embedding’siz çözüm)**

---



In [None]:
# Vocabulary cap: keep only the 5000 most frequent terms.
tfidf_vocab_size = 5000
vectorizer = TfidfVectorizer(max_features=tfidf_vocab_size)

# Fit on the training split only, then reuse the fitted vectorizer unchanged
# to transform the test split.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


**Model Eğitimi (Logistic Regression)**

In [None]:
# Fit a logistic-regression classifier with default settings on the TF-IDF
# training matrix. fit() returns the estimator itself, so chaining leaves
# `model` bound to the same fitted object.
model = LogisticRegression().fit(X_train_tfidf, y_train)
model  # last expression: display the fitted estimator, as the bare fit() call did


**Tahmin ve Değerlendirme**

In [None]:
# Predict labels for the held-out TF-IDF matrix.
y_pred = model.predict(X_test_tfidf)

# Each entry is (heading, metric); they are printed in this order:
# accuracy, confusion matrix, per-class precision/recall/F1 report.
report_sections = (
    ("Doğruluk:", accuracy_score(y_test, y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
    ("Classification Report:\n", classification_report(y_test, y_pred)),
)
for heading, metric in report_sections:
    print(heading, metric)


Doğruluk: 0.78985
Confusion Matrix:
 [[123794  35700]
 [ 31548 128958]]
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.78      0.79    159494
           1       0.78      0.80      0.79    160506

    accuracy                           0.79    320000
   macro avg       0.79      0.79      0.79    320000
weighted avg       0.79      0.79      0.79    320000



In [None]:


# Install gensim, which provides the Word2Vec implementation imported below.
!pip install gensim




In [None]:
import warnings
# Suppress all Python warnings from this point on.
# NOTE(review): this is a blanket filter, so it also hides warnings raised by
# later cells — confirm that is intended.
warnings.filterwarnings("ignore")
import gensim
from gensim.models import Word2Vec
# NOTE(review): word_tokenize and the 'punkt' download below do not appear to
# be used by any cell visible here (tokenization is done with a regex) —
# confirm whether they are still needed.
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import re
def simple_tokenizer(text):
    """Split `text` into a list of lowercase word tokens.

    A token is a run of word characters delimited by word boundaries;
    everything else acts as a separator. Returns an empty list when `text`
    contains no word characters.
    """
    lowered = text.lower()
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(lowered)

# Tokenize every cleaned tweet; each row of 'tokens' holds a list of words.
df['tokens'] = df['clean_text'].map(simple_tokenizer)


**Word2Vec modeli eğit**

In [None]:
# NOTE(review): gensim is already installed and imported by earlier cells in
# this notebook, so this cell looks redundant on a top-to-bottom run.
!pip install gensim
import gensim
from gensim.models import Word2Vec





In [None]:
# Turn the tokens column into a plain Python list of token lists.
sentences = list(df['tokens'])

# Train the Word2Vec model on the tokenized tweets.
w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)


In [None]:
import numpy as np

def get_avg_vector(tokens, model, vector_size):
    """Average the embeddings of the tokens that `model` has a vector for.

    Args:
        tokens: iterable of words to look up.
        model: object whose ``wv`` attribute supports ``word in wv`` and
            ``wv[word]`` lookups.
        vector_size: length of the zero vector returned as a fallback.

    Returns:
        The element-wise mean of the vectors found, or a zero vector of
        length `vector_size` when none of the tokens is present in ``model.wv``
        (including when `tokens` is empty).
    """
    keyed_vectors = model.wv
    known = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
    if not known:
        # No token has an embedding: fall back to an all-zero vector.
        return np.zeros(vector_size)
    return np.mean(known, axis=0)

# Read the embedding dimensionality from the trained model rather than
# repeating the literal 100, so the all-zero fallback vector always has the
# same length as the real embeddings even if the training setting changes.
vector_size = w2v_model.wv.vector_size
# One averaged embedding per tweet, stored as an array in the 'vector' column.
df['vector'] = df['tokens'].apply(lambda x: get_avg_vector(x, w2v_model, vector_size))


**Sınıflandırma Modeli Kur (örneğin Logistic Regression)**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Feature rows are the per-tweet averaged Word2Vec vectors; labels are the
# sentiment targets.
X, y = list(df['vector']), df['target']

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so `clf` is the fitted classifier.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Report per-class precision/recall/F1 on the held-out rows.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.73      0.74      0.74    159494
           1       0.74      0.74      0.74    160506

    accuracy                           0.74    320000
   macro avg       0.74      0.74      0.74    320000
weighted avg       0.74      0.74      0.74    320000



In [None]:
from gensim.models import Word2Vec

# Train a separate Word2Vec model for export, this time with min_count=1
# (the model trained earlier for the classifier used min_count=2).
# NOTE(review): this rebinds `model`, which an earlier cell bound to the
# TF-IDF LogisticRegression classifier; re-running that cell's predict step
# after this one would presumably fail — consider a distinct name.
model = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=1, workers=4)

# Vocabulary words and their embedding rows, in matching order.
words = list(model.wv.index_to_key)
vectors = model.wv[words]

# Write one line per word: the word followed by its vector components, all
# tab-separated. The encoding is pinned to UTF-8 because the vocabulary can
# contain non-ASCII tokens and open() would otherwise use the platform's
# locale encoding.
# NOTE(review): if this file is meant for the TensorBoard Embedding Projector
# (the separate metadata.tsv suggests so), that tool may expect numeric
# columns only, with the words living solely in metadata.tsv — confirm.
with open('vectors.tsv', 'w', encoding='utf-8') as f:
    for word, vector in zip(words, vectors):
        f.write(f"{word}\t" + "\t".join(map(str, vector)) + "\n")


In [None]:
from google.colab import files

# Hand the exported vectors file to the browser as a download.
vectors_path = 'vectors.tsv'
files.download(vectors_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Create the metadata file: one vocabulary word per line, in the order of `words`.
# UTF-8 is set explicitly because the words can contain non-ASCII characters
# and open() would otherwise fall back to the platform's locale encoding.
with open('metadata.tsv', 'w', encoding='utf-8') as f:
    f.writelines(f"{word}\n" for word in words)


In [None]:
from google.colab import files

# Hand the exported metadata file to the browser as a download.
metadata_path = 'metadata.tsv'
files.download(metadata_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>