<a href="https://colab.research.google.com/github/gupta24789/hugging-face/blob/main/02_data_wrangling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from datasets import load_dataset

In [None]:
## Load the CSV files from the local "data" directory into a DatasetDict
# Map each split name to the CSV file that backs it.
csv_splits = {"train": "train.csv", "test": "val.csv"}
dataset = load_dataset("csv", data_dir="data", data_files=csv_splits)
dataset

DatasetDict({
    train: Dataset({
        features: ['raw_tweet', 'label'],
        num_rows: 8004
    })
    test: Dataset({
        features: ['raw_tweet', 'label'],
        num_rows: 2000
    })
})

In [None]:
## Convert data to pandas df
# A fixed random_state keeps the six previewed rows identical on every
# Restart & Run All; an unseeded sample shows different rows each run.
sample_df = dataset['train'].to_pandas().sample(6, random_state=42)
sample_df

Unnamed: 0,raw_tweet,label
7071,@MotorsportCntrl @GP2_Official so true! This w...,0.0
1407,#Friday's here!! Give us a call today for all...,1.0
1045,@ArianeBeeston Communal knowledge! :),1.0
724,@TisoyPeter follow @jnlazts &amp; http://t.co...,1.0
3116,I kinda wanna fangirl over not an apology agai...,1.0
4956,All is fair in love and war kapan update :(\n\...,0.0


## Visualize map functions

In [None]:
# Shuffle with a fixed seed, then keep the first ten examples as a tiny demo set.
shuffled_train = dataset["train"].shuffle(seed=42)
sample_data = shuffled_train.select(range(10))
sample_data

Dataset({
    features: ['raw_tweet', 'label'],
    num_rows: 10
})

In [None]:
# `print` returns None; this call exists only to show the dict that `map`
# passes to the function for each example.
sample_data = sample_data.map(lambda example: print(example))

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

{'raw_tweet': 'im sending Alex like a million pictures :)', 'label': 1.0}
{'raw_tweet': '#TURKEY ARE NOW #BOMBING #ISIS IN #SYRIA,AND ALLOW THE #USA TO USE ONE OF IT #AIRFIELDS FOR THEIR #JETS :()', 'label': 0.0}
{'raw_tweet': 'No drop home for me today :(', 'label': 0.0}
{'raw_tweet': 'There are startup community in the tropics too! Geeks on the beach :) #startupPH https://t.co/Bg4SxKN3tg', 'label': 1.0}
{'raw_tweet': '@Humna__Khan Acha Thek :p', 'label': 1.0}
{'raw_tweet': "i walked out of the fruit shop with LOTS of pumpkins and a guy smiled at me because it looked pretty funny, but doesn't matter, it works :)", 'label': 1.0}
{'raw_tweet': 'the internet is being a total bitch : (', 'label': 0.0}
{'raw_tweet': '@mexeeN @FRSkyRRoZ spoile :)', 'label': 1.0}
{'raw_tweet': "My mom's a linguist. My dad's a computer scientist. And I am the dumbest one in the family :-(", 'label': 0.0}
{'raw_tweet': '@Uber no ice cream for me :-( #getthescoop http://t.co/NNEeBaoTVY', 'label': 0.0}


## Subset of data

In [None]:
# Same seeded shuffle as before, this time keeping the first 1000 examples.
shuffled_train = dataset["train"].shuffle(seed=42)
sample_data = shuffled_train.select(range(1000))
sample_data

Dataset({
    features: ['raw_tweet', 'label'],
    num_rows: 1000
})

## rename columns

In [None]:
# Only one column is renamed, so the single-column helper is sufficient.
sample_data = sample_data.rename_column("raw_tweet", "tweets")
sample_data

Dataset({
    features: ['tweets', 'label'],
    num_rows: 1000
})

## lowercase text

In [None]:
def lowercase_text(example):
    """Return a column update holding the lowercased "tweets" text.

    Args:
        example: mapping with a string under the "tweets" key.

    Returns:
        A dict with the single key "tweets" whose value is the lowercased text.
    """
    lowered = example["tweets"].lower()
    return {"tweets": lowered}

# Apply the lowercasing to every example.
sample_data = sample_data.map(function=lowercase_text)
sample_data

Dataset({
    features: ['tweets', 'label'],
    num_rows: 1000
})

## create new column tweets_len

In [None]:
def create_tweets_len(row):
    """Count the whitespace-separated tokens in a tweet.

    Args:
        row: mapping with a string under the "tweets" key.

    Returns:
        A dict with the single key "tweets_len" holding the token count.
    """
    # split() with no argument splits on any run of whitespace (spaces, tabs,
    # newlines) and drops empty strings. split(" ") counted "a\n\nb" as one
    # token and produced empty tokens for consecutive spaces.
    return {"tweets_len": len(row["tweets"].split())}


# Add the "tweets_len" column to every example.
sample_data = sample_data.map(function=create_tweets_len)
sample_data

Dataset({
    features: ['tweets', 'label', 'tweets_len'],
    num_rows: 1000
})

In [None]:
## Another way to create new columns
# Inline-lambda variant; the result is only displayed, not assigned back.
# split() with no argument splits on any whitespace run, so newlines and
# repeated spaces no longer skew the count the way split(" ") did.
sample_data.map(lambda example: {"new_tweets_len": len(example["tweets"].split())})

Dataset({
    features: ['tweets', 'label', 'tweets_len', 'new_tweets_len'],
    num_rows: 1000
})

## filter data

In [None]:
# Keep only the examples whose "tweets_len" is at least 10.
subdata = sample_data.filter(lambda example: example["tweets_len"] >= 10)
subdata

Dataset({
    features: ['tweets', 'label', 'tweets_len'],
    num_rows: 540
})

In [None]:
## sort data
# Ascending by "tweets_len"; preview the first ten rows as a DataFrame.
shortest_first = sample_data.sort("tweets_len")
shortest_first.to_pandas().head(10)

Unnamed: 0,tweets,label,tweets_len
0,dudaftie...:-)\n\n#breaktym,1.0,1
1,*later :-),1.0,2
2,splendour :(,0.0,2
3,@harryperfx fback?:),1.0,2
4,over slept:):):),1.0,2
5,evening :(((,0.0,2
6,starving :-(,0.0,2
7,client_amends_edit_5_final_final_final.pdf\n\n...,1.0,2
8,goodnight :),1.0,2
9,crying :(\n#pdapaghimok,0.0,2


In [None]:
# Descending by "tweets_len"; preview the first ten rows as a DataFrame.
longest_first = sample_data.sort("tweets_len", reverse=True)
longest_first.to_pandas().head(10)

Unnamed: 0,tweets,label,tweets_len
0,@latersbby honey &amp; brown sugar is only goo...,0.0,30
1,@drbabarawan u said th remedy ov pak prob z to...,0.0,30
2,@chruilo i have to add that to my information ...,1.0,30
3,the last dick pic i got was awful. it ruined w...,0.0,29
4,"ohh :( 50 people bc it was an orchestra camp, ...",0.0,29
5,i walked out of the fruit shop with lots of pu...,1.0,28
6,@claredolotina i love dogs moar naman haha i h...,1.0,28
7,@cloudljp there were a lot of writings of the ...,0.0,28
8,@jess_lakeland sorry to hear this jess :( the ...,0.0,28
9,@petitemistress do it! i want to start one for...,0.0,28


## create train, val, test split

In [None]:
# Hold out 20% of the training data, seeded for reproducibility.
drug_dataset_clean = dataset["train"].train_test_split(train_size=0.8, seed=42)
# train_test_split names the held-out part "test"; it serves as the
# validation split here, so rename it.
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")
# Re-attach the originally loaded "test" split so the result really has the
# train / validation / test splits this section sets out to create.
drug_dataset_clean["test"] = dataset["test"]
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['raw_tweet', 'label'],
        num_rows: 6403
    })
    test: Dataset({
        features: ['raw_tweet', 'label'],
        num_rows: 1601
    })
})

## Save to disk

`Dataset.save_to_disk()` => save in Arrow format

Dataset.to_csv() => save in csv format

Dataset.to_json() => save in json format

In [None]:
# Write every split of the DatasetDict to the "hf-data" directory.
output_dir = "hf-data"
drug_dataset_clean.save_to_disk(output_dir)

Saving the dataset (0/1 shards):   0%|          | 0/6403 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1601 [00:00<?, ? examples/s]

## Load from disk

In [None]:
from datasets import load_from_disk

In [None]:
# Read the saved splits back from the "hf-data" directory and display them.
reloaded_dataset = load_from_disk("hf-data")
reloaded_dataset

DatasetDict({
    train: Dataset({
        features: ['raw_tweet', 'label'],
        num_rows: 6403
    })
    test: Dataset({
        features: ['raw_tweet', 'label'],
        num_rows: 1601
    })
})