# Erstellungsdaten Kanäle

In [None]:
import glob
import json
import os
import csv

def load_creation_dates(json_paths):
    """Collect channel titles and their dates from JSON export files.

    Every file is parsed as JSON and iterated entry by entry. For each entry
    whose ``'id'`` equals 1, ``entry['action']['title']`` is mapped to
    ``entry['date']``. Entries that lack one of these keys are ignored.
    (Assumes each file holds a list of dict-like entries — TODO confirm
    against the export format.)

    A file that cannot be decoded is reported and skipped. Without the
    explicit skip, the loop below would either fail on an undefined ``data``
    or silently re-process the entries of the previously read file.

    Args:
        json_paths: Iterable of paths to the JSON files to read.

    Returns:
        dict mapping title -> date. If the same title occurs more than once,
        the entry read last wins.
    """
    dates_by_title = {}
    for json_path in json_paths:
        with open(json_path, 'r', encoding='utf-8') as f:
            try:
                data = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError) as err:
                print(f'Skipping unreadable JSON file {json_path}: {err}')
                continue
        for s in data:
            try:
                if s['id'] == 1:
                    dates_by_title[s['action']['title']] = s['date']
            except KeyError:
                # Entry without 'id', 'action', 'title' or 'date': not usable.
                pass
    return dates_by_title


# Gather every JSON export from the input directory.
path = os.path.join('../json_files')
file_list = glob.glob(os.path.join(path, '*.json'))
creation_dict = load_creation_dates(file_list)

# Convert the data to a list of dictionaries for CSV writing
csv_data = [{'Title': key, 'Date': value} for key, value in creation_dict.items()]

# Define the CSV file path
csv_file_path = 'creation_data.csv'

# Write the data to the CSV file (UTF-8 so that non-ASCII titles survive
# regardless of the platform's default encoding)
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
    fieldnames = ['Title', 'Date']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    # Write the header
    writer.writeheader()

    # Write the data rows
    writer.writerows(csv_data)

print(f'Data has been saved to {csv_file_path}')


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import mpld3

# Read the table and parse its 'date' column into datetimes.
csv_file = '../telegram_data.csv'  # Replace with the path to your .csv file
df = pd.read_csv(csv_file)
df['date'] = pd.to_datetime(df['date'])

# Cutoff for the filter below (timezone-aware, UTC).
pandemic_start_date = pd.Timestamp('2020-01-01', tz='UTC')

# Keep only the rows dated on or after the cutoff.
# NOTE(review): the filter works on each row's own 'date'. Whether that means
# "channel created after the cutoff" depends on what a row represents — confirm.
on_or_after_start = df['date'] >= pandemic_start_date
channels_created_after_pandemic = df[on_or_after_start]

# Rows per channel_id, as a two-column frame ordered by ascending count.
rows_per_channel = channels_created_after_pandemic['channel_id'].value_counts()
channel_text_counts = rows_per_channel.reset_index()
channel_text_counts.columns = ['channel_id', 'text_count']
channel_text_counts = channel_text_counts.sort_values(by='text_count')

# Horizontal bar chart: one bar per channel_id, bar length = row count.
bar_fig, bar_ax = plt.subplots(figsize=(12, 6))
bar_ax.barh(
    channel_text_counts['channel_id'],
    channel_text_counts['text_count'],
    color='lightgreen',
)
bar_ax.set_xlabel('Number of Texts')
bar_ax.set_ylabel('Channel')
bar_ax.set_title('Number of Texts in Each Channel (Created After 01.01.2020)')
# Flip the y-axis direction.
# NOTE(review): the data is sorted ascending before plotting — verify that the
# flip really puts the intended end of the ranking at the top.
bar_ax.invert_yaxis()
bar_fig.tight_layout()

# Render the Matplotlib figure as interactive HTML and store it on disk.
html_fig = mpld3.fig_to_html(bar_fig)
with open('bar_chart_created_after_pandemic.html', 'w') as html_file:
    html_file.write(html_fig)

# Number of distinct channel_id values among the filtered rows.
num_channels_created_after_pandemic = channels_created_after_pandemic['channel_id'].nunique()
print(f"Number of channels created after 01.01.2020: {num_channels_created_after_pandemic}")


# Häufigkeit der Posts jedes Kanals

In [None]:
import pandas as pd
import plotly.express as px

# Load the CSV file into a Pandas DataFrame
csv_file = '../telegram_data.csv'  # Replace with the path to your .csv file
df = pd.read_csv(csv_file)

# Convert the 'date' column to a datetime object
df['date'] = pd.to_datetime(df['date'])

# Count the rows per channel_id and order the result by ascending count
channel_text_counts = df['channel_id'].value_counts().reset_index()
channel_text_counts.columns = ['channel_id', 'text_count']
channel_text_counts = channel_text_counts.sort_values(by='text_count')

# Create an interactive horizontal bar chart using Plotly
fig = px.bar(channel_text_counts, x='text_count', y='channel_id', orientation='h', color='channel_id',
             labels={'text_count': 'Number of Texts', 'channel_id': 'Channel'},
             title='Number of Texts in Each Channel')
# With horizontal bars the channels sit on the y-axis, so the category
# ordering has to be configured there; on the numeric x-axis it has no effect.
fig.update_yaxes(categoryorder='total ascending')

# Save the interactive plot to an HTML file
fig.write_html('interactive_bar_chart.html')

# Total number of rows, then the number of distinct channel_id values
print(len(df['date']))
channel_dict = dict.fromkeys(df['channel_id'], 1)
print(len(channel_dict))
print('interactive_bar_chart.html')

# Daten der Publikation - Wann wird viel publiziert

In [None]:
import plotly.express as px
import pandas as pd

# Selects which corpus is plotted; also used as prefix of the output file name.
word = 'medien'

if word == 'telegram':
    # Comma-separated table with a 'date' column in possibly mixed formats.
    csv_file = '../telegram_data.csv'  # Replace with the path to your .csv file
    df = pd.read_csv(csv_file, sep=',', low_memory=False)
    df['date'] = pd.to_datetime(df['date'], format='mixed')

    # Non-null channel_id values per calendar day.
    posts_by_day = df.groupby(df['date'].dt.date)['channel_id']
    daily_text_counts = posts_by_day.count().reset_index()
    daily_text_counts.columns = ['Date', 'Text Count']

    # Interactive bar chart, one black bar per day.
    fig = px.bar(
        daily_text_counts,
        x='Date',
        y='Text Count',
        labels={'Date': 'Date', 'Text Count': 'Number of Texts'},
        title='Number of Texts Published Over Time',
    )
    fig.update_traces(marker_color='black')
    fig.write_html(f'{word}_publikationsdaten.html')

    # Show the per-day table.
    print(daily_text_counts)

elif word == 'medien':
    # Tab-separated table with an ISO 8601 'text_date' column.
    csv_file = '../subcorpus_tp1.csv'  # Replace with the path to your .csv file
    df = pd.read_csv(csv_file, sep='\t', low_memory=False)
    df['text_date'] = pd.to_datetime(df['text_date'], format='ISO8601')

    # Non-null text_content values per calendar month.
    publication_month = df['text_date'].dt.to_period('M')
    texts_by_month = df.groupby(publication_month)['text_content']
    monthly_text_counts = texts_by_month.count().reset_index()
    monthly_text_counts.columns = ['Month', 'Text Count']

    # Month periods become 'YYYY-MM' strings for the axis labels.
    monthly_text_counts['Month'] = monthly_text_counts['Month'].dt.strftime('%Y-%m')

    # Interactive bar chart, one black bar per month.
    fig = px.bar(
        monthly_text_counts,
        x='Month',
        y='Text Count',
        labels={'Month': 'Month', 'Text Count': 'Number of Texts'},
        title='Number of Texts Published Over Time (Grouped by Month)',
    )
    fig.update_traces(marker_color='black')
    fig.write_html(f'{word}_publikationsdaten.html')

    # Show the per-month table.
    print(monthly_text_counts)

# Publikationsdatum und Erstellungsdatum

In [None]:
import csv
from datetime import datetime
import plotly.graph_objects as go
import pandas as pd
import glob
import json
import os

def read_csv_data(file_path, date_column):
    """Load a CSV file and parse one of its columns as datetimes.

    Args:
        file_path: Path of the CSV file to read.
        date_column: Name of the column to convert with ``pd.to_datetime``.

    Returns:
        The loaded DataFrame with ``date_column`` converted, or ``None`` when
        the file does not exist or any other error occurs while reading or
        converting. In both failure cases a message is printed instead of
        raising.
    """
    loaded = None
    try:
        frame = pd.read_csv(file_path)
        frame[date_column] = pd.to_datetime(frame[date_column])
        loaded = frame
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as err:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error reading CSV: {err}")
    return loaded

def calculate_moving_average(data, window_size):
    """Return the rolling mean of ``data`` over ``window_size`` observations.

    Args:
        data: Object offering a pandas-style ``rolling`` method (the caller
            in this file passes a Series).
        window_size: Size of the rolling window, forwarded as ``window``.

    Returns:
        The result of ``data.rolling(window=window_size).mean()``. With
        pandas' default settings for an integer window, positions that do not
        yet have a full window are NaN.
    """
    windowed = data.rolling(window=window_size)
    return windowed.mean()

# Input files: the table of texts and the table of channel dates.
texts_csv_file_path = '/Users/dventr/Desktop/telegram_daten/telegram_data.csv'
channels_csv_file_path = '/Users/dventr/Desktop/telegram_daten/subcorpus/creation_data.csv'

# --- Texts per day -----------------------------------------------------------
texts_df = read_csv_data(texts_csv_file_path, 'date')

if texts_df is not None:
    # Non-null channel_id values per calendar day.
    texts_by_day = texts_df.groupby(texts_df['date'].dt.date)['channel_id']
    daily_text_counts = texts_by_day.count().reset_index()
    daily_text_counts.columns = ['Date', 'Text Count']

    # Rolling mean over `window_size` consecutive rows of the daily table.
    window_size = 7  # You can adjust the window size as needed
    daily_text_counts['Smoothed Text Count'] = calculate_moving_average(
        daily_text_counts['Text Count'], window_size
    )

    # Stand-alone figure: raw counts plus the dashed smoothed line.
    # NOTE(review): text_fig is not written or shown in the visible code.
    text_fig = go.Figure(data=[
        go.Scatter(
            x=daily_text_counts['Date'],
            y=daily_text_counts['Text Count'],
            mode='lines',
            name='Text Count',
        ),
        go.Scatter(
            x=daily_text_counts['Date'],
            y=daily_text_counts['Smoothed Text Count'],
            mode='lines',
            name='Smoothed Text Count',
            line=dict(dash='dash'),
        ),
    ])

# --- Cumulative channel count ------------------------------------------------
channels_df = read_csv_data(channels_csv_file_path, 'Date')

if channels_df is not None:
    # Rows per calendar day, accumulated into a running total.
    channels_per_day = channels_df.groupby(channels_df['Date'].dt.date).size()
    daily_channel_counts = channels_per_day.cumsum().reset_index()
    daily_channel_counts.columns = ['Date', 'Cumulative Channel Count']

    # Stand-alone figure for the running total.
    # NOTE(review): channel_fig is not written or shown in the visible code.
    channel_fig = go.Figure(data=[
        go.Scatter(
            x=daily_channel_counts['Date'],
            y=daily_channel_counts['Cumulative Channel Count'],
            mode='lines',
            name='Cumulative Channel Count',
        ),
    ])

    # Combined figure: all traces are added to one figure and share its axes.
    # It is only produced when the channel data could be loaded.
    fig = go.Figure()

    if texts_df is not None:
        # Daily counts and their smoothed version.
        fig.add_trace(go.Scatter(
            x=daily_text_counts['Date'],
            y=daily_text_counts['Text Count'],
            mode='lines',
            name='Text Count',
        ))
        fig.add_trace(go.Scatter(
            x=daily_text_counts['Date'],
            y=daily_text_counts['Smoothed Text Count'],
            mode='lines',
            name='Smoothed Text Count',
            line=dict(dash='dash'),
        ))

    # Running total of channels.
    fig.add_trace(go.Scatter(
        x=daily_channel_counts['Date'],
        y=daily_channel_counts['Cumulative Channel Count'],
        mode='lines',
        name='Cumulative Channel Count',
    ))

    # Titles and axis styling (axis lines on, grid lines off).
    layout_settings = {
        'title': 'Text Counts and Cumulative Channel Counts Over Time',
        'xaxis_title': 'Date',
        'yaxis_title': 'Count',
        'xaxis': {'showline': True, 'showgrid': False},
        'yaxis': {'showline': True, 'showgrid': False},
    }
    fig.update_layout(**layout_settings)

    # Write the HTML file and open it right away.
    fig.write_html('combined_plot.html', auto_open=True)

    print("Data has been saved to combined_plot.html. Open it in your web browser to view the plot.")
