<a href="https://colab.research.google.com/github/damlakaynarca/Big-Data/blob/main/Spark_En_%C4%B0yi_Sonu%C3%A7_Tweet_Eval_Veri_Seti_%C3%9Czerinde_%C3%87al%C4%B1%C5%9Fma_Big_Data_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Gerekli Kütüphaneleri Yükleme
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz  # Spark 3.2.0 kullanacağız
!tar -xvf spark-3.2.0-bin-hadoop3.2.tgz > /dev/null  # Dosyayı açıyoruz
!pip install -q findspark datasets

# JAVA ve SPARK ortamını ayarlama
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.0-bin-hadoop3.2"  # Spark 3.2.0 konumunu kullanıyoruz

# Spark ve Findspark'ı Başlat
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace, udf
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Spark Oturumu Başlat
spark = SparkSession.builder \
    .appName("TweetEval Sentiment Analysis") \
    .getOrCreate()

# Hugging Face'ten Dataset Yükleme
from datasets import load_dataset
import pandas as pd

# 'tweet_eval' veri setini indirme
data = load_dataset("tweet_eval", "emotion")
train_data = pd.DataFrame(data['train'])
train_data.to_csv("tweet_eval_emotion.csv", index=False)

# Spark DataFrame'e Yükleme
df = spark.read.csv("tweet_eval_emotion.csv", header=True, inferSchema=True)
df = df.select(col("text").alias("tweet"), col("label").cast("integer"))  # Sütunları yeniden adlandırma

# Veri Ön İşleme
df = df.withColumn("tweet", lower(col("tweet")))  # Küçük harfe çevirme
df = df.withColumn("tweet", regexp_replace(col("tweet"), "[^a-zA-Z\\s]", ""))  # Özel karakterleri temizleme

# Dağıtılmış Fonksiyon: Tweet Uzunluğunu Hesaplama
def tweet_length(tweet):
    return len(tweet.split())

# UDF Tanımlama
tweet_length_udf = udf(tweet_length, IntegerType())

# UDF ile Yeni Sütun Ekleme
df = df.withColumn("tweet_length", tweet_length_udf(col("tweet")))

# Spark Shuffle Partition ayarı (Hız için)
spark.conf.set("spark.sql.shuffle.partitions", "4")

# Tokenization, Stopwords, TF-IDF ve Özellik Ölçeklendirme
tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
hashing_tf = HashingTF(inputCol="filtered_words", outputCol="rawFeatures", numFeatures=20000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)

# Özellikleri Ölçeklendirme
scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=False)

# Logistic Regression Model
lr = LogisticRegression(featuresCol="scaled_features", labelCol="label")

# Pipeline
pipeline = Pipeline(stages=[tokenizer, stopwords_remover, hashing_tf, idf, scaler, lr])

# Modeli Eğitme
model = pipeline.fit(df)
predictions = model.transform(df)

# Performans Değerlendirme
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print(f"Logistic Regression Doğruluğu: {accuracy:.2f}")

# En İyi Tahminleri Gösterme
predictions.select("tweet", "label", "prediction").show(5)

# Dağıtılmış Tweet Uzunluğu Görüntüleme
df.select("tweet", "tweet_length").show(10)

# Spark oturumunu kapatma
spark.stop()


Logistic Regression Doğruluğu: 0.99
+--------------------+-----+----------+
|               tweet|label|prediction|
+--------------------+-----+----------+
|worry is a down p...|    2|       2.0|
|my roommate its o...|    0|       0.0|
|no but thats so c...|    1|       1.0|
|rooneys fucking u...|    0|       0.0|
|its pretty depres...|    3|       3.0|
+--------------------+-----+----------+
only showing top 5 rows

+--------------------+------------+
|               tweet|tweet_length|
+--------------------+------------+
|worry is a down p...|          17|
|my roommate its o...|          14|
|no but thats so c...|          18|
|rooneys fucking u...|          13|
|its pretty depres...|          11|
|user but your pus...|          24|
|making that yearl...|          16|
|tiller and breezy...|          14|
|user broadband is...|          10|
|user look at thos...|           6|
+--------------------+------------+
only showing top 10 rows

