In [5]:
# İlk hücre
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Build the Spark session via getOrCreate(), requesting 2 GB of memory
# for the driver and for each executor.
spark = (
    SparkSession.builder
    .appName('ClimateWatch Data Processing')
    .config('spark.driver.memory', '2g')
    .config('spark.executor.memory', '2g')
    .getOrCreate()
)

print('Spark session created successfully!')

# Second cell
# Load the raw articles dataset (Parquet) from HDFS into a DataFrame.
hdfs_path = 'hdfs://hdfs-namenode:8020/climatewatch/raw/articles.parquet'
df = spark.read.parquet(hdfs_path)

# Print the schema Spark read from the Parquet file.
print('Schema:')
df.printSchema()

# Preview the first five rows without truncating long column values.
print('\nSample data:')
df.show(n=5, truncate=False)

# Third cell
# Bucket each article's sentiment score into a category and count the buckets.
# Scores above 0 are 'Positive', scores below 0 are 'Negative', and every
# remaining row is 'Neutral'.
# NOTE(review): a null sentiment presumably also ends up in 'Neutral', since
# neither comparison is true for it -- confirm that is intended.
category_expr = (
    when(col('sentiment') > 0, 'Positive')
    .when(col('sentiment') < 0, 'Negative')
    .otherwise('Neutral')
    .alias('sentiment_category')
)
sentiment_analysis = df.select(category_expr)

sentiment_counts = (
    sentiment_analysis
    .groupBy('sentiment_category')
    .count()
)
sentiment_counts.show()

# Fourth cell
# Persist the per-category counts to HDFS as Parquet, replacing any output
# left at this path by a previous run.
output_path = 'hdfs://hdfs-namenode:8020/climatewatch/processed/sentiment_analysis.parquet'
sentiment_counts.write.parquet(output_path, mode='overwrite')
print(f'Results saved to: {output_path}')

# Metin içeriğinde kelime sayımı
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql.functions import explode, lower, regexp_replace, col, length, when, lit, round, sum, desc

# Word-frequency pipeline: normalize the article text, tokenize it, drop
# stop words, and count the remaining words.

# Placeholder written in place of null/empty article text. It is defined once
# so that the cleaning step and the later filter cannot drift apart: the
# filter previously compared against 'notext', which never matched the
# 'no_text' token actually produced, so the placeholder was counted as a word.
NO_TEXT_PLACEHOLDER = 'no_text'

# Lower-case the text and replace every character that is not an ASCII letter
# or whitespace with a space; rows with null or empty text get the placeholder
# instead (the placeholder itself does not go through the regex).
text_is_missing = col('text').isNull() | (length(col('text')) == 0)
cleaned_df = df.select(
    when(text_is_missing, lit(NO_TEXT_PLACEHOLDER))
    .otherwise(regexp_replace(lower(col('text')), '[^a-zA-Z\\s]', ' '))
    .alias('cleaned_text')
)

# Split the cleaned text into words.
tokenizer = Tokenizer(inputCol='cleaned_text', outputCol='words')
words_df = tokenizer.transform(cleaned_df)

# Remove stop words (StopWordsRemover's default word list is used).
remover = StopWordsRemover(inputCol='words', outputCol='filtered_words')
filtered_df = remover.transform(words_df)

# Count words: one row per token, keep tokens longer than 3 characters,
# exclude the missing-text placeholder, then sort by descending frequency.
word_counts = (
    filtered_df.select(explode(col('filtered_words')).alias('word'))
    .filter(length('word') > 3)
    .filter(col('word') != NO_TEXT_PLACEHOLDER)
    .groupBy('word')
    .count()
    .orderBy(desc('count'))
)

print('En sık kullanılan kelimeler:')
word_counts.show(20)

# Persist the word frequencies to HDFS as Parquet, overwriting earlier output.
word_counts_path = 'hdfs://hdfs-namenode:8020/climatewatch/processed/word_counts.parquet'
word_counts_writer = word_counts.write.mode('overwrite')
word_counts_writer.parquet(word_counts_path)
print(f'Kelime sayımı sonuçları kaydedildi: {word_counts_path}')

# Distribution of sentiment scores: round each score to two decimals, count
# the rows per rounded score, and sort by score. Null scores are replaced
# with 0.0 before counting.
rounded_score = (
    when(col('sentiment').isNull(), lit(0.0))
    .otherwise(round(col('sentiment'), 2))
    .alias('sentiment_score')
)
scores = df.select(rounded_score)
sentiment_distribution = (
    scores
    .groupBy('sentiment_score')
    .count()
    .orderBy('sentiment_score')
)

print('\nSentiment skorlarının dağılımı:')
sentiment_distribution.show(20)

# Persist the score distribution to HDFS as Parquet, overwriting earlier output.
distribution_path = 'hdfs://hdfs-namenode:8020/climatewatch/processed/sentiment_distribution.parquet'
sentiment_distribution.write.parquet(distribution_path, mode='overwrite')
print(f'Sentiment dağılımı kaydedildi: {distribution_path}')

# Data quality report: count the null values in every column of df.
print('\nVeri kalitesi raporu:')

# One aggregate per column. `sum` here is the Spark SQL aggregate imported
# from pyspark.sql.functions earlier in the file, not the Python builtin;
# the null flag is cast to int so it can be summed.
null_count_exprs = []
for c in df.columns:
    null_flag = col(c).isNull().cast('int')
    null_count_exprs.append(sum(null_flag).alias(f'{c}_null_count'))

null_counts = df.select(null_count_exprs)
null_counts.show()

# Text length analysis: character length of each article's text, with a
# null text counted as length 0.
text_length_expr = (
    when(col('text').isNull(), 0)
    .otherwise(length(col('text')))
    .alias('text_length')
)
text_length_analysis = df.select(text_length_expr)

# Print the summary statistics produced by describe() for the length column.
print('\nMetin uzunluğu istatistikleri:')
text_length_stats = text_length_analysis.describe()
text_length_stats.show()

# Text length distribution: assign each length to a named bucket.
# A length of exactly 0 is 'empty'; after that the first upper bound the
# length falls below decides the label, and anything at or above 5000 is
# 'very_long'.
text_length_value = col('text_length')
length_category_expr = when(text_length_value == 0, 'empty')
for upper_bound, label in [
    (100, 'very_short'),
    (500, 'short'),
    (1000, 'medium'),
    (5000, 'long'),
]:
    length_category_expr = length_category_expr.when(text_length_value < upper_bound, label)
length_category_expr = length_category_expr.otherwise('very_long').alias('length_category')

text_length_bins = text_length_analysis.select(length_category_expr)

print('\nMetin uzunluğu dağılımı:')
# NOTE(review): orderBy sorts on the label string, so rows appear in
# alphabetical label order rather than from shortest to longest bucket --
# confirm that is the intended presentation.
bin_counts = text_length_bins.groupBy('length_category').count()
bin_counts.orderBy('length_category').show()

Spark session created successfully!
Schema:
root
 |-- url: string (nullable = true)
 |-- text: string (nullable = true)
 |-- sentiment: double (nullable = true)


Sample data:
+------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------