| What problems should we worry about?        | What can we do about these problems? |
| ------------------------------------------- | ------------------------------------ |
| `Salary Estimate` and `Revenue` are strings | Convert to tuples of numbers         |


In [82]:
import pandas

df = pandas.read_csv("./2.3.csv")
df.head()

Unnamed: 0,job_title,salary,description,rating,company,Location,headquarters,size,founded,ownership,industry,sector,revenue
0,Sr Data Scientist,$137K-$171K (Glassdoor est.),Description\n\nThe Senior Data Scientist is re...,3.1,Healthfirst,"New York, NY","New York, NY","('1001', '5000')",1993.0,Nonprofit Organization,Insurance Carriers,Insurance,
1,Data Scientist,$137K-$171K (Glassdoor est.),"Secure our Nation, Ignite your Future\n\nJoin ...",4.2,ManTech,"Chantilly, VA","Herndon, VA","('5001', '10000')",1968.0,Company - Public,Research & Development,Business Services,$1 to $2 billion (USD)
2,Data Scientist,$137K-$171K (Glassdoor est.),Overview\n\n\nAnalysis Group is one of the lar...,3.8,Analysis Group,"Boston, MA","Boston, MA","('1001', '5000')",1981.0,Private Practice / Firm,Consulting,Business Services,$100 to $500 million (USD)
3,Data Scientist,$137K-$171K (Glassdoor est.),JOB DESCRIPTION:\n\nDo you have a passion for ...,3.5,INFICON,"Newton, MA","Bad Ragaz, Switzerland","('501', '1000')",2000.0,Company - Public,Electrical & Electronic Manufacturing,Manufacturing,$100 to $500 million (USD)
4,Data Scientist,$137K-$171K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions,"New York, NY","New York, NY","('51', '200')",1998.0,Company - Private,Advertising & Marketing,Business Services,


In [83]:
df["revenue"].value_counts()

$100 to $500 million (USD)          94
$10+ billion (USD)                  63
$2 to $5 billion (USD)              45
$10 to $25 million (USD)            41
$25 to $50 million (USD)            36
$1 to $2 billion (USD)              36
$1 to $5 million (USD)              31
$50 to $100 million (USD)           31
$500 million to $1 billion (USD)    19
Less than $1 million (USD)          14
$5 to $10 million (USD)             14
$5 to $10 billion (USD)              8
Name: revenue, dtype: int64

In [84]:
from typing import List, Union


def handle_revenue(number_info: List[str]):
    if number_info in [["Less", "than"], []]:
        return ""
    number = int("".join([digit for digit in number_info[0] if digit.isnumeric()]))
    shift = 1_000_000 if "million" in number_info else 1_000_000_000
    return str(number * shift)


def handle_revenues(revenue: Union[str, float]):
    if not isinstance(revenue, str):
        return revenue
    segments = revenue.split()
    segments.remove("(USD)")
    if "to" in segments:
        segments.remove("to")

    lower = segments[:-2]
    higher = segments[-2:]
    if not len(lower):
        lower = higher
        higher = []

    return (handle_revenue(lower), handle_revenue(higher))


df["revenue"] = df["revenue"].apply(handle_revenues)
df["revenue"].value_counts()

(100000000000, 500000000)    94
(10000000000, )              63
(2000000000, 5000000000)     45
(10000000000, 25000000)      41
(1000000000, 2000000000)     36
(25000000000, 50000000)      36
(1000000000, 5000000)        31
(50000000000, 100000000)     31
(500000000, 1000000000)      19
(5000000000, 10000000)       14
(, 1000000)                  14
(5000000000, 10000000000)     8
Name: revenue, dtype: int64

In [85]:
df["salary"].value_counts()

$79K-$131K (Glassdoor est.)     32
$75K-$131K (Glassdoor est.)     32
$99K-$132K (Glassdoor est.)     32
$90K-$109K (Glassdoor est.)     30
$137K-$171K (Glassdoor est.)    30
$90K-$124K (Glassdoor est.)     22
$56K-$97K (Glassdoor est.)      22
$79K-$106K (Glassdoor est.)     22
$128K-$201K (Glassdoor est.)    21
$95K-$119K (Glassdoor est.)     21
$122K-$146K (Glassdoor est.)    21
$110K-$163K (Glassdoor est.)    21
$91K-$150K (Glassdoor est.)     21
$124K-$198K (Glassdoor est.)    21
$112K-$116K (Glassdoor est.)    21
$92K-$155K (Glassdoor est.)     21
$138K-$158K (Glassdoor est.)    21
$69K-$116K (Glassdoor est.)     21
$101K-$165K (Glassdoor est.)    21
$212K-$331K (Glassdoor est.)    21
$31K-$56K (Glassdoor est.)      20
$141K-$225K (Glassdoor est.)    20
$145K-$225K(Employer est.)      20
$79K-$147K (Glassdoor est.)     20
$87K-$141K (Glassdoor est.)     20
$80K-$132K (Glassdoor est.)     20
$105K-$167K (Glassdoor est.)    20
$66K-$112K (Glassdoor est.)     20
$71K-$123K (Glassdoo

In [86]:
def handle_salary(number_info: str):
    number = int("".join([digit for digit in number_info if digit.isnumeric()]))
    return str(number * 1000)


def handle_salaries(salary: Union[str, float]):
    if not isinstance(salary, str):
        return salary
    segments = salary.split()[0].split("-")

    return (handle_salary(segments[0]), handle_salary(segments[1]))


df["salary"] = df["salary"].apply(handle_salaries)
df["salary"].value_counts()

(75000, 131000)     32
(99000, 132000)     32
(79000, 131000)     32
(137000, 171000)    30
(90000, 109000)     30
(79000, 106000)     22
(90000, 124000)     22
(56000, 97000)      22
(128000, 201000)    21
(110000, 163000)    21
(101000, 165000)    21
(91000, 150000)     21
(95000, 119000)     21
(92000, 155000)     21
(69000, 116000)     21
(212000, 331000)    21
(138000, 158000)    21
(112000, 116000)    21
(122000, 146000)    21
(124000, 198000)    21
(79000, 147000)     20
(66000, 112000)     20
(80000, 132000)     20
(31000, 56000)      20
(105000, 167000)    20
(141000, 225000)    20
(145000, 225000)    20
(87000, 141000)     20
(71000, 123000)     19
(79000, 133000)     19
Name: salary, dtype: int64

In [87]:
df

Unnamed: 0,job_title,salary,description,rating,company,Location,headquarters,size,founded,ownership,industry,sector,revenue
0,Sr Data Scientist,"(137000, 171000)",Description\n\nThe Senior Data Scientist is re...,3.1,Healthfirst,"New York, NY","New York, NY","('1001', '5000')",1993.0,Nonprofit Organization,Insurance Carriers,Insurance,
1,Data Scientist,"(137000, 171000)","Secure our Nation, Ignite your Future\n\nJoin ...",4.2,ManTech,"Chantilly, VA","Herndon, VA","('5001', '10000')",1968.0,Company - Public,Research & Development,Business Services,"(1000000000, 2000000000)"
2,Data Scientist,"(137000, 171000)",Overview\n\n\nAnalysis Group is one of the lar...,3.8,Analysis Group,"Boston, MA","Boston, MA","('1001', '5000')",1981.0,Private Practice / Firm,Consulting,Business Services,"(100000000000, 500000000)"
3,Data Scientist,"(137000, 171000)",JOB DESCRIPTION:\n\nDo you have a passion for ...,3.5,INFICON,"Newton, MA","Bad Ragaz, Switzerland","('501', '1000')",2000.0,Company - Public,Electrical & Electronic Manufacturing,Manufacturing,"(100000000000, 500000000)"
4,Data Scientist,"(137000, 171000)",Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions,"New York, NY","New York, NY","('51', '200')",1998.0,Company - Private,Advertising & Marketing,Business Services,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,Data Scientist,"(105000, 167000)",Summary\n\nWe’re looking for a data scientist ...,3.6,TRANZACT,"Fort Lee, NJ","Fort Lee, NJ","('1001', '5000')",1989.0,Company - Private,Advertising & Marketing,Business Services,
668,Data Scientist,"(105000, 167000)",Job Description\nBecome a thought leader withi...,,JKGT,"San Francisco, CA",,,,,,,
669,Data Scientist,"(105000, 167000)",Join a thriving company that is changing the w...,,AccessHope,"Irwindale, CA",,,,,,,
670,Data Scientist,"(105000, 167000)",100 Remote Opportunity As an AINLP Data Scient...,5.0,ChaTeck Incorporated,"San Francisco, CA","Santa Clara, CA","('1', '50')",,Company - Private,Advertising & Marketing,Business Services,"(1000000000, 5000000)"


In [88]:
df.to_csv("./2.4.csv", index=False)