In [1]:
# İlk hücre
import os
import io
import yaml
import pandas as pd
from hdfs import InsecureClient
import pyarrow as pa
import pyarrow.parquet as pq

# İkinci hücre
# HDFS bağlantısını yapılandır
hdfs_client = InsecureClient('http://hdfs-namenode:9870', user='root')

# HDFS'de dizin oluştur
base_path = '/climatewatch'
subdirs = ['raw', 'processed', 'models', 'temp']

for subdir in subdirs:
    path = f'{base_path}/{subdir}'
    try:
        hdfs_client.makedirs(path)
        print(f'Created directory: {path}')
    except Exception as e:
        print(f'Directory already exists or error: {path} - {str(e)}')

# Üçüncü hücre
# Örnek veri yükleme
try:
    df = pd.read_csv('/home/jovyan/work/scraped_articles_with_sentiment.csv')
    print(f'Loaded {len(df)} rows from CSV')
except Exception as e:
    print(f'Error loading CSV: {str(e)}')
    # Örnek veri oluştur
    df = pd.DataFrame({
        'title': ['Sample Article 1', 'Sample Article 2'],
        'content': ['Climate change effects', 'Environmental impact'],
        'sentiment': [0.5, -0.2]
    })

# Dördüncü hücre
# DataFrame'i Parquet formatında HDFS'e yaz
hdfs_path = f'{base_path}/raw/articles.parquet'

# DataFrame'i Parquet formatına dönüştür
table = pa.Table.from_pandas(df)
buf = io.BytesIO()
pq.write_table(table, buf)
buf.seek(0)

# HDFS'e yaz
with hdfs_client.write(hdfs_path, overwrite=True) as writer:
    writer.write(buf.getvalue())

print(f'Written data to HDFS: {hdfs_path}')

# Beşinci hücre
# HDFS'den veriyi oku
with hdfs_client.read(hdfs_path) as reader:
    buf = io.BytesIO(reader.read())
    buf.seek(0)
    table = pq.read_table(buf)
    df_from_hdfs = table.to_pandas()

print(f'Read {len(df_from_hdfs)} rows from HDFS')
df_from_hdfs.head()

# Altıncı hücre
# HDFS dizin içeriğini listele
contents = hdfs_client.list(base_path)
print(f'\nContents of {base_path}:')
for item in contents:
    print(f'- {item}')

Created directory: /climatewatch/raw
Created directory: /climatewatch/processed
Created directory: /climatewatch/models
Created directory: /climatewatch/temp
Loaded 10 rows from CSV
Written data to HDFS: /climatewatch/raw/articles.parquet
Read 10 rows from HDFS

Contents of /climatewatch:
- models
- processed
- raw
- temp
