# Transforming text data

In [1]:
import polars as pl

In [2]:
df = pl.DataFrame(
    {
        "publication": [
            "The Daily Deception",
            "Faux News Network",
            "The Fabricator",
            "The Misleader",
            "The Hoax Herald",
        ],
        "date": [
            "2022-01-01",
            "2022-01-03",
            "2022-01-04",
            "2022-01-05",
            "2022-01-06",
        ],
        "title": [
            "Scientists Discover New Species of Flying Elephant",
            "Aliens Land on Earth and Offer to Solve All Our Problems",
            "Study Shows That Eating Pizza Every Day Leads to Longer Life",
            "New Study Finds That Smoking is Good for You",
            "World's Largest Iceberg Discovered in Florida",
        ],
        "text": [
            "In a groundbreaking discovery, scientists have found a new species of elephant that can fly. The flying elephants, which were found in the Amazon rainforest, have wings that span over 50 feet and can reach speeds of up to 100 miles per hour. This is a game-changing discovery that could revolutionize the field of zoology.",
            "In a historic moment for humanity, aliens have landed on Earth and offered to solve all our problems. The extraterrestrial visitors, who arrived in a giant spaceship that landed in Central Park, have advanced technology that can cure disease, end hunger, and reverse climate change. The world is waiting to see how this incredible offer will play out.",
            "A new study has found that eating pizza every day can lead to a longer life. The study, which was conducted by a team of Italian researchers, looked at the eating habits of over 10,000 people and found that those who ate pizza regularly lived on average two years longer than those who didn't. The study has been hailed as a breakthrough in the field of nutrition.",
            "In a surprising twist, a new study has found that smoking is actually good for you. The study, which was conducted by a team of British researchers, looked at the health outcomes of over 100,000 people and found that those who smoked regularly had lower rates of heart disease and cancer than those who didn't. The findings have sparked controversy among health experts.",
            "In a bizarre turn of events, the world's largest iceberg has been discovered in Florida. The iceberg, which is over 100 miles long and 50 miles wide, was found off the coast of Miami by a group of tourists on a whale-watching tour. Scientists are baffled by the discovery and are scrambling to figure out how an iceberg of this size could have",
        ],
    }
)

In [3]:
pl.Config.set_fmt_str_lengths(100)

polars.config.Config

In [4]:
df

publication,date,title,text
str,str,str,str
"""The Daily Deception""","""2022-01-01""","""Scientists Discover New Species of Flying Elephant""","""In a groundbreaking discovery, scientists have found a new species of elephant that can fly. The fly…"
"""Faux News Network""","""2022-01-03""","""Aliens Land on Earth and Offer to Solve All Our Problems""","""In a historic moment for humanity, aliens have landed on Earth and offered to solve all our problems…"
"""The Fabricator""","""2022-01-04""","""Study Shows That Eating Pizza Every Day Leads to Longer Life""","""A new study has found that eating pizza every day can lead to a longer life. The study, which was co…"
"""The Misleader""","""2022-01-05""","""New Study Finds That Smoking is Good for You""","""In a surprising twist, a new study has found that smoking is actually good for you. The study, which…"
"""The Hoax Herald""","""2022-01-06""","""World's Largest Iceberg Discovered in Florida""","""In a bizarre turn of events, the world's largest iceberg has been discovered in Florida. The iceberg…"


## The `.str` namespace
Polars has a `.str` [namespace](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/string.html) to group string expressions together.

## Changing case

In [None]:
df.select(
    pl.col("title").str.to_uppercase()
).head(3)

title
str
"""SCIENTISTS DISCOVER NEW SPECIES OF FLYING ELEPHANT"""
"""ALIENS LAND ON EARTH AND OFFER TO SOLVE ALL OUR PROBLEMS"""
"""STUDY SHOWS THAT EATING PIZZA EVERY DAY LEADS TO LONGER LIFE"""


## Length of strings

In [6]:
df.select(
    pl.col("title").str.len_chars().alias("len_chars"),
    pl.col("title").str.len_bytes().alias("len_bytes")
)

len_chars,len_bytes
u32,u32
50,50
56,56
60,60
44,44
45,45


In [7]:
pl.DataFrame({"title": ["Holocene", "22 (OVER S∞∞N)"]}).select(
    len_chars=pl.col("title").str.len_chars(),
    len_bytes=pl.col("title").str.len_bytes(),
)

len_chars,len_bytes
u32,u32
8,8
14,18


## Remove whitespace

- `strip_chars_start` removes the leading white space.
- `strip_chars_end` removes the trailing white space.
- `strip_chars` removes white space of both side.

In [8]:
pl.DataFrame(
        {"foo": [" lead", "trail ", " both "]}
    )\
.select(
    pl.col("foo").str.strip_chars_start()
)

foo
str
"""lead"""
"""trail """
"""both """


## Justify and padding
We can return a string justified to a certain length with a padding character.

In [9]:
pl.DataFrame({"foo": [" lead", "trail ", " both "]})\
.select(
    "foo",
    pl.col("foo").str.pad_end(length=6, fill_char="*").alias("left_justified")
)

foo,left_justified
str,str
""" lead""",""" lead*"""
"""trail ""","""trail """
""" both """,""" both """


zero-padding with `zfill`

In [15]:
pl.DataFrame({"foo": [17, 591, 935]})\
.select(
    pl.col("foo").cast(pl.String).str.zfill(5)
)

foo
str
"""00017"""
"""00591"""
"""00935"""


## Splitting text

We can split text into a `pl.List` dtype column with the `str.split` method.

In [17]:
df.with_columns(
    pl.col("text").str.split(" ")
)

publication,date,title,text
str,str,str,list[str]
"""The Daily Deception""","""2022-01-01""","""Scientists Discover New Species of Flying Elephant""","[""In"", ""a"", … ""zoology.""]"
"""Faux News Network""","""2022-01-03""","""Aliens Land on Earth and Offer to Solve All Our Problems""","[""In"", ""a"", … ""out.""]"
"""The Fabricator""","""2022-01-04""","""Study Shows That Eating Pizza Every Day Leads to Longer Life""","[""A"", ""new"", … ""nutrition.""]"
"""The Misleader""","""2022-01-05""","""New Study Finds That Smoking is Good for You""","[""In"", ""a"", … ""experts.""]"
"""The Hoax Herald""","""2022-01-06""","""World's Largest Iceberg Discovered in Florida""","[""In"", ""a"", … ""have""]"


In [19]:
df.with_columns(
    pl.col("text").str.split(" ")
).explode("text")

publication,date,title,text
str,str,str,str
"""The Daily Deception""","""2022-01-01""","""Scientists Discover New Species of Flying Elephant""","""In"""
"""The Daily Deception""","""2022-01-01""","""Scientists Discover New Species of Flying Elephant""","""a"""
"""The Daily Deception""","""2022-01-01""","""Scientists Discover New Species of Flying Elephant""","""groundbreaking"""
"""The Daily Deception""","""2022-01-01""","""Scientists Discover New Species of Flying Elephant""","""discovery,"""
"""The Daily Deception""","""2022-01-01""","""Scientists Discover New Species of Flying Elephant""","""scientists"""
…,…,…,…
"""The Hoax Herald""","""2022-01-06""","""World's Largest Iceberg Discovered in Florida""","""of"""
"""The Hoax Herald""","""2022-01-06""","""World's Largest Iceberg Discovered in Florida""","""this"""
"""The Hoax Herald""","""2022-01-06""","""World's Largest Iceberg Discovered in Florida""","""size"""
"""The Hoax Herald""","""2022-01-06""","""World's Largest Iceberg Discovered in Florida""","""could"""


Cast the repeated values to reduce the memory usage.

In [20]:
df.with_columns(
    pl.col(["publication", "title"]).cast(pl.Categorical),
    pl.col("text").str.split(" ")
).explode("text")

publication,date,title,text
cat,str,cat,str
"""The Daily Deception""","""2022-01-01""","""Scientists Discover New Species of Flying Elephant""","""In"""
"""The Daily Deception""","""2022-01-01""","""Scientists Discover New Species of Flying Elephant""","""a"""
"""The Daily Deception""","""2022-01-01""","""Scientists Discover New Species of Flying Elephant""","""groundbreaking"""
"""The Daily Deception""","""2022-01-01""","""Scientists Discover New Species of Flying Elephant""","""discovery,"""
"""The Daily Deception""","""2022-01-01""","""Scientists Discover New Species of Flying Elephant""","""scientists"""
…,…,…,…
"""The Hoax Herald""","""2022-01-06""","""World's Largest Iceberg Discovered in Florida""","""of"""
"""The Hoax Herald""","""2022-01-06""","""World's Largest Iceberg Discovered in Florida""","""this"""
"""The Hoax Herald""","""2022-01-06""","""World's Largest Iceberg Discovered in Florida""","""size"""
"""The Hoax Herald""","""2022-01-06""","""World's Largest Iceberg Discovered in Florida""","""could"""


In [21]:
df.with_columns(
    pl.col("text").str.split(" ")
).explode("text")["text"]\
.value_counts(sort=True)

text,count
str,u32
"""a""",14
"""of""",13
"""that""",9
"""The""",8
"""the""",8
…,…
"""scrambling""",1
"""figure""",1
"""out""",1
"""an""",1


In [None]:
# split a string column to each character

df.select(
    pl.col("publication").str.split("").explode()
).head(6)

publication
str
"""T"""
"""h"""
"""e"""
""" """
"""D"""
"""a"""


## Merging string columns to create a new column

In [23]:
df.with_columns(
    title_data = pl.concat_str(
        [
            pl.col("title"),
            pl.col("date").cast(pl.Utf8)
        ],
        separator="_"
    )
).head(2)

publication,date,title,text,title_data
str,str,str,str,str
"""The Daily Deception""","""2022-01-01""","""Scientists Discover New Species of Flying Elephant""","""In a groundbreaking discovery, scientists have found a new species of elephant that can fly. The fly…","""Scientists Discover New Species of Flying Elephant_2022-01-01"""
"""Faux News Network""","""2022-01-03""","""Aliens Land on Earth and Offer to Solve All Our Problems""","""In a historic moment for humanity, aliens have landed on Earth and offered to solve all our problems…","""Aliens Land on Earth and Offer to Solve All Our Problems_2022-01-03"""


# Exercises

## Exercise 1
Clean the data so that 
- the data in the `id` column is homogenous with values `A` and `B`
- sort the `DataFrame` by zero-padded strings in the `values` column (without casting to integers)
- add a column to count how many characters there are in the `values` column

In [27]:
pl.DataFrame(
    {
        "id": ["A", "B", "a", "b"],
        "values": ["20", "5", " 13", "40"],
    }
).with_columns(
    pl.col("id").str.to_uppercase(),
    pl.col("values").str.strip_chars().str.zfill(2)
).with_columns(
    pl.col("values").str.len_chars().alias("len_chars")
).sort("values")

id,values,len_chars
str,str,u32
"""B""","""05""",2
"""A""","""13""",2
"""A""","""20""",2
"""B""","""40""",2


### Exercise 2
Clean the `origin` column of this `DataFrame` so that you can count how many records come from each city

In [28]:
df_origin = pl.DataFrame(
    [
        {"origin": "New York   ", "age": 25},
        {"origin": "Los Angeles", "age": 31},
        {"origin": "  miami", "age": 47},
        {"origin": "  Chicago  ", "age": 19},
        {"origin": "   boston   ", "age": 55},
        {"origin": " New York   ", "age": 28},
        {"origin": "los Angeles", "age": 11},
        {"origin": "Miami", "age": 27},
        {"origin": "  chicago  ", "age": 31},
        {"origin": "  Boston   ", "age": 45},
        {"origin": "new york", "age": 25},
    ]
)

The output should look like this:

In [29]:
pl.DataFrame(
    [
        {"origin": "new york", "counts": 3},
        {"origin": "los angeles", "counts": 2},
        {"origin": "miami", "counts": 2},
        {"origin": "chicago", "counts": 2},
        {"origin": "boston", "counts": 2},
    ]
)

origin,counts
str,i64
"""new york""",3
"""los angeles""",2
"""miami""",2
"""chicago""",2
"""boston""",2


In [30]:
df_origin.with_columns(
    pl.col("origin").str.strip_chars().str.to_uppercase()
)["origin"].value_counts(sort=True)

origin,count
str,u32
"""NEW YORK""",3
"""LOS ANGELES""",2
"""MIAMI""",2
"""CHICAGO""",2
"""BOSTON""",2


### Exercise 3

Clean and then justify the text to have 4-digit years.

In [31]:
(
    pl.DataFrame(
        {"year": ["2022", "21", "22 "]}
    )
    .select(
        pl.col("year").str.strip_chars().str.pad_start(3, "0").str.pad_start(4, "2")
    )
)

year
str
"""2022"""
"""2021"""
"""2022"""


### Exercise 4
Split the `id` column into a `pl.Struct` column called `struct_col` with 3 fields. 

In [37]:
(
    pl.DataFrame(
        [
            {"id": "AAA-BBB-2"},
            {"id": "AAA-BBB-3"},
            {"id": "AAA-CCC-2"},
            {"id": "AAA-DDD-3"},
            {"id": "AAA-BBB-4"},
        ]
    )
    .with_columns(
        struct_col = pl.col("id").str.split_exact("-", n=2)
    )
)

id,struct_col
str,struct[3]
"""AAA-BBB-2""","{""AAA"",""BBB"",""2""}"
"""AAA-BBB-3""","{""AAA"",""BBB"",""3""}"
"""AAA-CCC-2""","{""AAA"",""CCC"",""2""}"
"""AAA-DDD-3""","{""AAA"",""DDD"",""3""}"
"""AAA-BBB-4""","{""AAA"",""BBB"",""4""}"


Convert the struct fields into columns of the `DataFrame`

In [39]:
(
    pl.DataFrame(
        [
            {"id": "AAA-BBB-2"},
            {"id": "AAA-BBB-3"},
            {"id": "AAA-CCC-2"},
            {"id": "AAA-DDD-3"},
            {"id": "AAA-BBB-4"},
        ]
    )
    .with_columns(
        struct_col = pl.col("id").str.split_exact("-", n=2)
    )
    .unnest(columns="struct_col")
)

id,field_0,field_1,field_2
str,str,str,str
"""AAA-BBB-2""","""AAA""","""BBB""","""2"""
"""AAA-BBB-3""","""AAA""","""BBB""","""3"""
"""AAA-CCC-2""","""AAA""","""CCC""","""2"""
"""AAA-DDD-3""","""AAA""","""DDD""","""3"""
"""AAA-BBB-4""","""AAA""","""BBB""","""4"""


## Exercise 5
We create a `DataFrame` from the Spotify data

In [40]:
pl.Config.set_fmt_str_lengths(100)
pl.Config.set_tbl_rows(10)
spotify_csv = "data/spotify-charts-2017-2021-global-top200.csv.gz"
spotify_df = pl.read_csv(spotify_csv,try_parse_dates=True)
spotify_df.head(3)

title,rank,date,artist,url,region,chart,trend,streams
str,i64,date,str,str,str,str,str,i64
"""Starboy""",1,2017-01-01,"""The Weeknd, Daft Punk""","""https://open.spotify.com/track/5aAx2yezTd8zXrkmtKl66Z""","""Global""","""top200""","""SAME_POSITION""",3135625
"""Closer""",2,2017-01-01,"""The Chainsmokers, Halsey""","""https://open.spotify.com/track/7BKLCZ1jbUBVqRi2FVlTVw""","""Global""","""top200""","""SAME_POSITION""",3015525
"""Let Me Love You""",3,2017-01-01,"""DJ Snake, Justin Bieber""","""https://open.spotify.com/track/4pdPtRcBmOSQDlJ3Fk945m""","""Global""","""top200""","""MOVE_UP""",2545384


Let's find out what makes for long track titles.

- Keep one row for every unique track (with uniqueness defined by title and artist) 
- Add columns with the length of the title column in characters (`len_chars`) and bytes (`len_chars`)
- Find the 10 tracks with the longest titles by number of characters

In [41]:
(
    spotify_df
    .unique(["title", "artist"])
    .with_columns(
        len_chars = pl.col("title").str.len_chars(),
        len_bytes = pl.col("title").str.len_bytes(),
    )
    .sort(by="len_chars", descending=True)
    .head(10)
    .select("title","artist","len_chars","len_bytes")
)

title,artist,len_chars,len_bytes
str,str,u32,u32
"""Wu Tang Forever (ft. Ghostface Killah, Raekwon, RZA, Method Man, Inspectah Deck, Cappadonna, Jackpot…","""Logic""",139,139
"""Don't Shoot (feat. Rick Ross, 2 Chainz, Diddy, Fabolous, Wale, DJ Khaled, Swizz Beatz, Yo Gotti, Cur…","""The Game, Curren$y""",135,135
"""Costa Rica (with Bas & JID feat. Guapdad 4000, Reese LAFLARE, Jace, Mez, Smokepurpp, Buddy & Ski Mas…","""Dreamville""",116,116
"""I Don’t Wanna Live Forever (Fifty Shades Darker) - From ""Fifty Shades Darker (Original Motion Pictur…","""ZAYN, Taylor Swift""",114,116
"""Waka Waka (This Time for Africa) [The Official 2010 FIFA World Cup (TM) Song] (feat. Freshlyground)""","""Shakira""",99,99
"""Sucker for Pain (with Wiz Khalifa, Imagine Dragons, Logic & Ty Dolla $ign feat. X Ambassadors)""","""Lil Wayne""",94,94
"""Dominick the Donkey (The Italian Christmas Donkey) [with Joe Reisman's Orchestra and Chorus]""","""Lou Monte""",92,92
"""Body (Remix) [feat. ArrDee, E1 (3x3), ZT (3x3), Bugzy Malone, Buni, Fivio Foreign & Darkoo]""","""Tion Wayne, Russ Millions""",91,91
"""Welcome to the Party (with French Montana & Lil Pump, feat. Zhavia Ward) - from Deadpool 2""","""Diplo""",90,90
"""Medley: Caroling, Caroling / The First Noel / Hark! The Herald Angels Sing / Silent Night""","""Perry Como""",89,89


When do we get the biggest difference between the title representation in characters and bytes?

- Add a column called `diff` with the difference in the number of bytes and characters in the title
- Keep only tracks where the difference is greater than 0
- Show the 10 tracks with the biggest difference

In [43]:
(
    spotify_df
    .unique(["title", "artist"])
    .with_columns(
        len_chars = pl.col("title").str.len_chars().cast(pl.Int32),
        len_bytes = pl.col("title").str.len_bytes().cast(pl.Int32),
    )
    .with_columns(
        diff = pl.col("len_bytes") - pl.col("len_chars")
    )
    .filter(
        pl.col("diff") > 0
    )
    .sort(by="diff", descending=True)
    .head(10)
    .select("title","artist","len_chars","len_bytes","diff")
)

title,artist,len_chars,len_bytes,diff
str,str,i32,i32,i32
"""美女と野獣""","""Ariana Grande, John Legend""",5,15,10
"""Daydream (백일몽)""","""j-hope""",14,20,6
"""Benz Truck - гелик""","""Lil Peep""",18,23,5
"""That’s When (feat. Keith Urban) (Taylor’s Version) (From The Vault)""","""Taylor Swift""",67,71,4
"""1985 - Intro to “The Fall Off”""","""J. Cole""",30,34,4
"""Rohdiamant ٢٠٢٠""","""Samra""",15,19,4
"""You’re Not Sorry (Taylor’s Version)""","""Taylor Swift""",35,39,4
"""Don’t You (Taylor’s Version) (From The Vault)""","""Taylor Swift""",45,49,4
"""Don’t Call Me Angel (Charlie’s Angels) (with Miley Cyrus & Lana Del Rey)""","""Ariana Grande""",72,76,4
"""fuck, i'm lonely (with Anne-Marie) - from “13 Reasons Why: Season 3”""","""Lauv""",68,72,4
