In [1]:
import pandas as pd
from datetime import datetime
import re

In [2]:
def parse_tweets(file_path, date_pattern=r'Feb \d+, 2023'):
    r"""Parse a text dump of tweets into a DataFrame, one row per tweet.

    The file is read as UTF-8 and split on the literal ``'\nTimestamp: '``.
    Every non-blank chunk produces one row; a field whose pattern is not
    found in the chunk is stored as ``None``.

    Parameters
    ----------
    file_path : str or path-like
        Text file to parse.
    date_pattern : str, optional
        Regular expression matching the tweet's display date. It is embedded
        in a larger expression, so it should not rely on regex flags. The
        default keeps the original behaviour (``Feb <day>, 2023`` only).

    Returns
    -------
    pandas.DataFrame
        Columns ``timestamp`` (datetime), ``username``, ``handle``,
        ``tweet_date`` and ``content``. The columns are present even when
        no tweet was found, so results from several files concatenate with
        a stable schema.

    Notes
    -----
    A chunk whose timestamp has the right shape but is not a valid date
    (``strptime`` raises ``ValueError``) is reported with ``print`` and
    skipped.
    """
    columns = ['timestamp', 'username', 'handle', 'tweet_date', 'content']

    # Compile once; these are loop-invariant.
    timestamp_re = re.compile(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})')
    author_re = re.compile(r'Text: (.*?)\n@([^\n]+)')
    date_re = re.compile(date_pattern)
    date_line_re = re.compile(rf'(?:{date_pattern})\n')
    # Start of the trailing counters: a digits-only line followed by a line
    # that starts with digits. Anchoring to the line start keeps digits that
    # merely end a sentence (e.g. "Happy 2023") from cutting the text short.
    metrics_re = re.compile(r'^\d+\n\d+', re.MULTILINE)

    with open(file_path, 'r', encoding='utf-8') as f:
        raw_text = f.read()

    tweets = []
    for block in raw_text.split('\nTimestamp: '):
        if not block.strip():
            continue

        timestamp_match = timestamp_re.search(block)
        try:
            timestamp = (
                datetime.strptime(timestamp_match.group(1), '%Y-%m-%d %H:%M:%S')
                if timestamp_match else None
            )
        except ValueError as e:
            # strptime is the only step that can fail; keep the original
            # best-effort behaviour of reporting and skipping the chunk.
            print(f"Error parsing tweet block: {e}")
            continue

        author_match = author_re.search(block)
        username = author_match.group(1).strip() if author_match else None
        handle = author_match.group(2).strip() if author_match else None

        date_match = date_re.search(block)
        tweet_date = date_match.group(0) if date_match else None

        # The tweet text runs from the line after the date up to the
        # counters, or to the end of the chunk when there are none.
        date_line = date_line_re.search(block)
        if date_line:
            remainder = block[date_line.end():]
            metrics = metrics_re.search(remainder)
            tweet_text = (remainder[:metrics.start()] if metrics else remainder).strip()
        else:
            tweet_text = None

        tweets.append({
            'timestamp': timestamp,
            'username': username,
            'handle': handle,
            'tweet_date': tweet_date,
            'content': tweet_text,
        })

    return pd.DataFrame(tweets, columns=columns)

In [7]:
# Parse each raw tweet dump into its own DataFrame, in file order.
tweet_files = [
    'tweets3.txt',
    'tweets4.txt',
    'tweets5.txt',
    'tweets6.txt',
    'tweets7.txt',
]
df1, df2, df3, df4, df5 = map(parse_tweets, tweet_files)

In [11]:
# Stack the per-file frames, drop exact duplicate rows, then renumber.
# The index is reset *after* de-duplication so the final frame has a
# contiguous 0..n-1 index; resetting first leaves gaps where rows were dropped.
combined_df = (
    pd.concat([df1, df2, df3, df4, df5], axis=0)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [13]:
# Write the combined tweets to disk; the positional index is left out of the file.
output_csv = 'tweets.csv'
combined_df.to_csv(output_csv, index=False)