# Twitter Data Extraction

In [None]:
import pandas as pd
from sklearn.utils import resample
import os
import zipfile

# Build the balanced 100k-row training sample from the Kaggle archive.
#
# subprocess is imported here (not at the top of the cell) so this block is
# self-contained.
import subprocess


def find_largest_csv(search_root):
    """Return the path of the largest ``.csv`` file under *search_root*.

    Selection is by file size rather than by walk order, so the result does
    not depend on how the archive happens to lay out its directories when it
    contains more than one CSV.

    Returns:
        The path as a string, or ``None`` when no ``.csv`` file exists.
    """
    candidates = [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(search_root)
        for name in names
        if name.endswith(".csv")
    ]
    return max(candidates, key=os.path.getsize, default=None)


def sample_class(frame, n_rows, seed=42):
    """Draw *n_rows* rows from *frame*, reproducibly.

    Rows are drawn without replacement whenever *frame* has at least
    *n_rows* rows, so the sample contains no duplicates. Only when the class
    is too small does it fall back to sampling with replacement.

    Raises:
        ValueError: if *frame* is empty (nothing to sample from).
    """
    if frame.empty:
        raise ValueError("Cannot sample from an empty class")
    needs_oversampling = len(frame) < n_rows
    return frame.sample(n=n_rows, replace=needs_oversampling, random_state=seed)


# Jupyter executes cells with __name__ == "__main__", so the pipeline below
# still runs in the notebook; the guard only prevents it from running when the
# helpers above are imported from elsewhere.
if __name__ == "__main__":
    zip_path = "customer-support-on-twitter.zip"
    extract_path = "extracted_data"

    # 1. Download dataset (skipped when the archive is already on disk).
    if not os.path.exists(zip_path):
        # check=True raises CalledProcessError on a failed download instead of
        # letting the failure surface later as a confusing zipfile error.
        subprocess.run(
            [
                "kaggle", "datasets", "download",
                "-d", "thoughtvector/customer-support-on-twitter",
            ],
            check=True,
        )

    # 2. Extract ZIP
    os.makedirs(extract_path, exist_ok=True)
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extract_path)

    # 3. Locate the CSV to load
    csv_file = find_largest_csv(extract_path)
    if csv_file is None:
        raise FileNotFoundError("CSV file not found after extraction")
    print("Found CSV:", csv_file)

    # 4. Load dataset
    df = pd.read_csv(csv_file)
    print("Dataset loaded:", df.shape)

    # 5. Target column
    target_col = "inbound"

    # 6. Separate classes
    df_true = df[df[target_col].eq(True)]
    df_false = df[df[target_col].eq(False)]

    # 7. Balanced sampling (50k each, no duplicate rows)
    df_true_sample = sample_class(df_true, 50000)
    df_false_sample = sample_class(df_false, 50000)

    # 8. Combine & shuffle
    balanced_df = pd.concat([df_true_sample, df_false_sample]).sample(
        frac=1, random_state=42
    )

    balanced_df.to_csv("twitter_support_balanced_1L.csv", index=False)

    print("DONE: twitter_support_balanced_1L.csv created")

# For Model Validation - 25k

In [None]:
import pandas as pd
from sklearn.utils import resample
import os
import zipfile

# Build the balanced 25k-row validation sample from the Kaggle archive.
#
# subprocess is imported here (not at the top of the cell) so this block is
# self-contained.
import subprocess


def find_largest_csv(search_root):
    """Return the path of the largest ``.csv`` file under *search_root*.

    Selection is by file size rather than by walk order, so the result does
    not depend on how the archive happens to lay out its directories when it
    contains more than one CSV.

    Returns:
        The path as a string, or ``None`` when no ``.csv`` file exists.
    """
    candidates = [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(search_root)
        for name in names
        if name.endswith(".csv")
    ]
    return max(candidates, key=os.path.getsize, default=None)


def sample_class(frame, n_rows, seed=42):
    """Draw *n_rows* rows from *frame*, reproducibly.

    Rows are drawn without replacement whenever *frame* has at least
    *n_rows* rows, so the sample contains no duplicates. Only when the class
    is too small does it fall back to sampling with replacement.

    Raises:
        ValueError: if *frame* is empty (nothing to sample from).
    """
    if frame.empty:
        raise ValueError("Cannot sample from an empty class")
    needs_oversampling = len(frame) < n_rows
    return frame.sample(n=n_rows, replace=needs_oversampling, random_state=seed)


def exclude_rows_in_file(frame, other_csv, id_col="tweet_id"):
    """Return *frame* without the rows whose *id_col* value occurs in *other_csv*.

    Used to keep validation rows disjoint from an already-written training
    file. *frame* is returned unchanged when *other_csv* does not exist or
    when either side lacks *id_col*.

    NOTE(review): the default assumes a ``tweet_id`` column uniquely
    identifies a row in this dataset -- confirm against the loaded CSV.
    """
    if id_col not in frame.columns or not os.path.exists(other_csv):
        return frame
    # Read only the header first so a file without the column is skipped
    # instead of raising from usecols.
    if id_col not in pd.read_csv(other_csv, nrows=0).columns:
        return frame
    seen_ids = pd.read_csv(other_csv, usecols=[id_col])[id_col]
    return frame[~frame[id_col].isin(seen_ids)]


# Jupyter executes cells with __name__ == "__main__", so the pipeline below
# still runs in the notebook; the guard only prevents it from running when the
# helpers above are imported from elsewhere.
if __name__ == "__main__":
    zip_path = "customer-support-on-twitter.zip"
    extract_path = "extracted_data"

    # 1. Download dataset (skipped when the archive is already on disk).
    if not os.path.exists(zip_path):
        # check=True raises CalledProcessError on a failed download instead of
        # letting the failure surface later as a confusing zipfile error.
        subprocess.run(
            [
                "kaggle", "datasets", "download",
                "-d", "thoughtvector/customer-support-on-twitter",
            ],
            check=True,
        )

    # 2. Extract ZIP
    os.makedirs(extract_path, exist_ok=True)
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extract_path)

    # 3. Locate the CSV to load
    csv_file = find_largest_csv(extract_path)
    if csv_file is None:
        raise FileNotFoundError("CSV file not found after extraction")
    print("Found CSV:", csv_file)

    # 4. Load dataset
    df = pd.read_csv(csv_file)
    print("Dataset loaded:", df.shape)

    # 5. Target column
    target_col = "inbound"

    # 6. Separate classes, leaving out rows already written to the training
    #    file: sampling the same frame with the same seed would otherwise put
    #    training rows into the validation set.
    validation_pool = exclude_rows_in_file(df, "twitter_support_balanced_1L.csv")
    print("Rows excluded as already in training file:", len(df) - len(validation_pool))
    df_true = validation_pool[validation_pool[target_col].eq(True)]
    df_false = validation_pool[validation_pool[target_col].eq(False)]

    # 7. Balanced sampling (12.5k each, no duplicate rows)
    df_true_sample = sample_class(df_true, 12500)
    df_false_sample = sample_class(df_false, 12500)

    # 8. Combine & shuffle
    balanced_df = pd.concat([df_true_sample, df_false_sample]).sample(
        frac=1, random_state=42
    )

    balanced_df.to_csv("twitter_validation_25k.csv", index=False)

    print("DONE: twitter_validation_25k.csv created")