In [1]:
import kagglehub
from tqdm import tqdm
from pathlib import Path
import pandas as pd
from transformers import pipeline
from datasets import Dataset

In [2]:
# Fetch the latest published version of the Kaggle dataset; kagglehub returns
# the location of the downloaded files, which later cells read from.
path = kagglehub.dataset_download(
    "kashishparmar02/social-media-sentiments-analysis-dataset"
)

print(f"Path to dataset files: {path}")

Path to dataset files: /root/.cache/kagglehub/datasets/kashishparmar02/social-media-sentiments-analysis-dataset/versions/3


In [3]:
# Read the sentiment CSV that ships inside the downloaded dataset folder,
# then preview the first rows.
csv_file = Path(path).joinpath("sentimentdataset.csv")
df = pd.read_csv(csv_file)
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
2,2,2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
3,3,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18
4,4,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19


In [4]:
# Drop the two "Unnamed" columns (presumably stray index columns written by
# earlier CSV exports -- their values mirror the row index in the preview).
# errors="ignore" keeps this cell idempotent: `df` is rebound to the result,
# so re-running the cell would otherwise raise KeyError because the columns
# have already been removed.
df = df.drop(columns=["Unnamed: 0", "Unnamed: 0.1"], errors="ignore")
df.head()

Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18
4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19


In [5]:
# Build the Hugging Face translation pipeline on top of the NLLB-200 3.3B
# checkpoint.
pipe = pipeline(
    task="translation",
    model="facebook/nllb-200-3.3B",
    device=0,  # index 0 -> first GPU
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


In [None]:
# Wrap the cleaned DataFrame in a Hugging Face Dataset so it can be mapped
# over in batches.
dataset = Dataset.from_pandas(df=df)

# Batched callback used to add a translated text column.
def translate_batch(examples):
    """Translate a batch of posts and carry the remaining columns through.

    The "Text" column is sent through the module-level ``pipe`` translation
    pipeline with ``src_lang="eng_Latn"`` and ``tgt_lang="mya_Mymr"``.

    Args:
        examples: Batch indexed by column name ("Text", "Sentiment", ...).
            Presumably the column -> list-of-values mapping that a batched
            ``Dataset.map`` passes to its callback -- confirm against caller.

    Returns:
        dict: The original columns unchanged, plus "Text_MM" containing the
        ``"translation_text"`` entry of each pipeline result, in input order.
    """
    english_texts = examples["Text"]
    results = pipe(
        english_texts,
        src_lang="eng_Latn",
        tgt_lang="mya_Mymr",
        batch_size=64,
    )
    burmese_texts = [item["translation_text"] for item in results]

    # Source text first, translation second, then every pass-through column in
    # its original order.
    batch_out = {"Text": english_texts, "Text_MM": burmese_texts}
    for column in (
        "Sentiment",
        "Timestamp",
        "User",
        "Platform",
        "Hashtags",
        "Retweets",
        "Likes",
        "Country",
        "Year",
        "Month",
        "Day",
        "Hour",
    ):
        batch_out[column] = examples[column]
    return batch_out

In [None]:
# Translate the whole dataset. With batched=True, `map` feeds the callback
# slices of up to `batch_size` rows (tunable). The original columns are
# removed so the result holds exactly what translate_batch returns.
map_options = dict(
    batched=True,
    batch_size=64,
    remove_columns=dataset.column_names,
)
translated_dataset = dataset.map(translate_batch, **map_options)



Map:   0%|          | 0/732 [00:00<?, ? examples/s]

In [28]:
# Bring the translated dataset back into pandas to fix the column order.
df = translated_dataset.to_pandas()

# Target layout: source text, its translation, then the metadata columns.
# Names missing from the frame are skipped rather than raising.
available = set(df.columns)
ordered_columns = [
    name
    for name in (
        "Text", "Text_MM", "Sentiment", "Timestamp", "User",
        "Platform", "Hashtags", "Retweets", "Likes",
        "Country", "Year", "Month", "Day", "Hour",
    )
    if name in available
]
df = df.loc[:, ordered_columns]

In [29]:
# Convert the column-ordered DataFrame back into a Dataset for publishing.
reordered_dataset = Dataset.from_pandas(df=df)

In [30]:
# Show the dataset summary (feature names and row count) as a final check
# before uploading.
reordered_dataset

Dataset({
    features: ['Text', 'Text_MM', 'Sentiment', 'Timestamp', 'User', 'Platform', 'Hashtags', 'Retweets', 'Likes', 'Country', 'Year', 'Month', 'Day', 'Hour'],
    num_rows: 732
})

In [31]:
# Publish the translated dataset to the Hugging Face Hub; the returned commit
# info is displayed as the cell output.
reordered_dataset.push_to_hub(
    repo_id="chuuhtetnaing/myanmar-social-media-sentiment-analysis-dataset"
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/chuuhtetnaing/myanmar-social-media-sentiment-analysis-dataset/commit/6c00516d603b1822b4c1fcc7fd20e042b727e645', commit_message='Upload dataset', commit_description='', oid='6c00516d603b1822b4c1fcc7fd20e042b727e645', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/chuuhtetnaing/myanmar-social-media-sentiment-analysis-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='chuuhtetnaing/myanmar-social-media-sentiment-analysis-dataset'), pr_revision=None, pr_num=None)