In [79]:
"""
This program will take the pasted input from a webpage 
Then use splitting and regex to extract the critical information into a dataframe
"""


from distutils.command.clean import clean
import os
import pandas as pd
import datetime as dt


def clean_text(text_file):
    """
    This function will take the text file and clean it up
    The columns of the dataframe will be:
    Organization Name, Funding Type, Money Raised, Announced Date,
    Organization Description, Organization Industries, Organization Location

    The raw data starts like:
    1.
    Cruise Logo
    Cruise --> Organization Name
    Corporate Round --> Funding Type
    $1,350,000,000 --> Money Raised
    Mar 18, 2022 --> Announced Date
    Corporate Round - Cruise Logo
    Corporate Round - Cruise
    Cruise builds self-driving vehicles that safely connect people to places, things, and experiences they care about. --> Organization Description
    Artificial Intelligence, Automotive, Autonomous Vehicles, Robotics, Software, Transportation --> Organization Industries
    San Francisco, California, United States, North America --> Organization Location
    """
    # open the text file and loop through it line by line
    with open(text_file, "r", encoding="utf8") as f:
        # create an empty data frame with the columns we want
        df = pd.DataFrame(
            columns=[
                "Organization Name",
                "Funding Type",
                "Money Raised",
                "Announced Date",
                "Organization Description",
                "Organization Industries",
                "Organization Location",
                "Organization Country"
            ]
        )
        data_frame_entry = []
        count = 0
        # loop through all lines in the file
        for line in data:
            # reset count to 0 and add the line to the dataframe after going through one entry
            if count == 11:
                count = 0
                # add data_frame_entry as a row to the dataframe
                df.loc[len(df)] = data_frame_entry
                # reset data_frame_entry to an empty list
                data_frame_entry = []
            if count == 1:
                # check if the line contains the word "Logo" else we know we need to add it
                if "Logo" not in line:
                    count -= 1

            if count == 2:# org name
                data_frame_entry.append(line.strip())
            if count == 3: # funding type
                data_frame_entry.append(line.strip())
            if count == 4: # money raised
                money_raised = convert_money(line.strip())                
                data_frame_entry.append(money_raised)
            if count == 5: # announced date
                date = line.strip()
                date = convert_date(date)
                data_frame_entry.append(date)
            if count == 8: # description
                data_frame_entry.append(line.strip())
            if count == 9: # industries
                data_frame_entry.append(line.strip())
            if count == 10: # location
                data_frame_entry.append(line.strip())
                if line.strip() == "—":
                    country = line.strip()
                else:
                    country = line.strip().split(",")[-2].strip()
                data_frame_entry.append(country)
            count += 1
    return df

# clean the text file 'data1.txt' in the folder webscape subfolder Data added to the current working directory
# get current working directory
cwd = os.getcwd()
file = cwd + "\\webscrape\\Data\\data1.txt"
file = 'data1.txt'


with open(file) as fp:
    data_raw = fp.read()
    # data = data_raw.split('\n\n')
df = clean_text(file)
# add a column for country by splitting the location column with ', ' and taking the second to last element



In [57]:
''.join(data_raw)[:100]

'1.\n\nCruise Logo\n\nCruise\n\nCorporate Round\n\n$1,350,000,000\n\nMar 18, 2022\n\nCorporate Round - Cruise Log'

In [119]:
import re

def get_data():
    row = []
    for line in data_raw.split('\n\n'):
        line = line.rstrip()
        if re.match('\d+\.',line):
            if row:
                yield row
            row = []
        row.append(line)
            
rows = get_data()
df = pd.DataFrame(rows)
del df[0]
del df[1]
del df[6]
del df[7]
df.columns = [
                "Organization Name",
                "Funding Type",
                "Money Raised",
                "Announced Date",
                "Organization Description",
                "Organization Industries",
                "Organization Location",
            ]
# convert money raised to an int
df['amount'] = df['Money Raised'].map(lambda s: int(''.join(re.findall('\d', s))))
df = df[df['Money Raised'].str.contains('$')]
df.head()

Unnamed: 0,Organization Name,Funding Type,Money Raised,Announced Date,Organization Description,Organization Industries,Organization Location,amount
0,Cruise,Corporate Round,"$1,350,000,000","Mar 18, 2022",Cruise builds self-driving vehicles that safel...,"Artificial Intelligence, Automotive, Autonomou...","San Francisco, California, United States, Nort...",1350000000
1,WeRide,Series D,"$400,000,000","Mar 24, 2022",WeRide is a leading Level 4 autonomous driving...,"Artificial Intelligence, Autonomous Vehicles, ...","Guangzhou, Guangdong, China, Asia",400000000
2,Uniphore,Series E,"$400,000,000","Feb 16, 2022",Uniphore is a customer service platform that i...,"Artificial Intelligence, Customer Service, Mac...","Palo Alto, California, United States, North Am...",400000000
3,Clari,Series F,"$225,000,000","Jan 19, 2022",Clari is a revenue operations company that pro...,"Analytics, Artificial Intelligence, Business I...","Sunnyvale, California, United States, North Am...",225000000
4,Soterea,Series B,"CN¥1,300,000,000","Mar 28, 2022",Soterea develops an intelligent driving system.,"Artificial Intelligence, Automotive, Autonomou...","Tianjin, Guangdong, China, Asia",1300000000


ValueError: could not convert string to Timestamp

0      1350000000
1       400000000
2       400000000
3       225000000
4      1300000000
          ...    
544         50000
545         50000
546         50000
547         35000
548         30000
Name: 4, Length: 549, dtype: int64

In [76]:
# This function adds a column called "Main Industry" to the dataframe for coloring via graph
# It will go through each row's industries and assign whatever industry has the highest value
def assign_main_industry(df, industry_total):
    # add a column to the dataframe called "Main Industry"
    df["Main Industry"] = ""
    # sort the dictionary by value
    sorted_industries = sorted(industry_total.items(), key=lambda x: x[1], reverse=True)
    for idx, long_industry in enumerate(df["Organization Industries"]):
        # split the long string on every ", "
        industries = long_industry.split(", ")
        for idx, industry in enumerate(sorted_industries):
            if industry[0] in industries:
                df["Main Industry"][idx] = industry[0]
    return df

def convert_money(money_raised):
    """
    This function will take the money raised convert it to a float and into the value of USDs
    """        
    if money_raised.startswith("$"):
        # remove the $ sign and all commas then convert to numeric
        money_raised = money_raised.replace("$", "").replace(",", "").replace(" ", "")
        money_raised = float(money_raised)
    elif money_raised.startswith("CN¥"):
        money_raised = money_raised.replace("CN¥", "").replace(",", "").replace(" ", "")
        money_raised = float(money_raised)
        money_raised = money_raised * 0.156961
    elif money_raised.startswith("TRY"):
        money_raised = money_raised.replace("TRY", "").replace(",", "").replace(" ", "")
        money_raised = float(money_raised)
        money_raised = money_raised * 0.068
    elif money_raised.startswith("£"):
        money_raised = money_raised.replace("£", "").replace(",", "").replace(" ", "")
        money_raised = float(money_raised)
        money_raised = money_raised * 1.31
    elif money_raised.startswith("A$"):
        money_raised = money_raised.replace("A$", "").replace(",", "").replace(" ", "")
        money_raised = float(money_raised)
        money_raised = money_raised * 0.740222 
    elif money_raised.startswith("CA$"):
        money_raised = money_raised.replace("CA$", "").replace(",", "").replace(" ", "")
        money_raised = float(money_raised)
        money_raised = money_raised * 0.792823 
    elif money_raised.startswith("¥"):
        money_raised = money_raised.replace("¥", "").replace(",", "").replace(" ", "")
        money_raised = float(money_raised)
        money_raised = money_raised * 0.0079
    elif money_raised.startswith("₹"):
        money_raised = money_raised.replace("₹", "").replace(",", "").replace(" ", "")
        money_raised = float(money_raised)
        money_raised = money_raised * 0.013
    elif money_raised.startswith("SEK"):
        money_raised = money_raised.replace("SEK", "").replace(",", "").replace(" ", "")
        money_raised = float(money_raised)
        money_raised = money_raised * 0.10
    elif money_raised.startswith("€"):
        money_raised = money_raised.replace("€", "").replace(",", "").replace(" ", "")
        money_raised = float(money_raised)
        money_raised = money_raised * 1.08
    elif money_raised.startswith("₩"):
        money_raised = money_raised.replace("₩", "").replace(",", "").replace(" ", "")
        money_raised = float(money_raised)
        money_raised = money_raised * 0.00081
    elif money_raised.startswith("SGD"):
        money_raised = money_raised.replace("SGD", "").replace(",", "").replace(" ", "")
        money_raised = float(money_raised)
        money_raised = money_raised * 0.74
    elif money_raised.startswith("R$"):
        money_raised = money_raised.replace("R$", "").replace(",", "").replace(" ", "")
        money_raised = float(money_raised)
        money_raised = money_raised * 0.212761
    elif money_raised.startswith("CHF"):
        money_raised = money_raised.replace("CHF", "").replace(",", "").replace(" ", "")
        money_raised = float(money_raised)
        money_raised = money_raised * 1.06
    return money_raised

def convert_date(the_date):
    """
    This function will take a date from a format of "Mar 18, 2022" and convert it to a datetime object
    """
    # split the date into its components
    month_day, year = the_date.split(", ")
    month, day = month_day.split(" ")
    # create a datetime object
    date_object = dt.datetime(int(year), month_to_int(month), int(day))
    return date_object

def month_to_int(month):
    """
    This function will take a month and convert it to an integer
    """
    if month == "Jan":
        return 1
    elif month == "Feb":
        return 2
    elif month == "Mar":
        return 3
    elif month == "Apr":
        return 4
    elif month == "May":
        return 5
    elif month == "Jun":
        return 6
    elif month == "Jul":
        return 7
    elif month == "Aug":
        return 8
    elif month == "Sep":
        return 9
    elif month == "Oct":
        return 10
    elif month == "Nov":
        return 11
    elif month == "Dec":
        return 12

In [27]:
df.head()

Unnamed: 0,Organization Name,Funding Type,Money Raised,Announced Date,Organization Description,Organization Industries,Organization Location,Organization Country
0,Cruise,Corporate Round,1350000000.0,2022-03-18,Cruise builds self-driving vehicles that safel...,"Artificial Intelligence, Automotive, Autonomou...","San Francisco, California, United States, Nort...",United States
1,WeRide,Series D,400000000.0,2022-03-24,WeRide is a leading Level 4 autonomous driving...,"Artificial Intelligence, Autonomous Vehicles, ...","Guangzhou, Guangdong, China, Asia",China
2,Uniphore,Series E,400000000.0,2022-02-16,Uniphore is a customer service platform that i...,"Artificial Intelligence, Customer Service, Mac...","Palo Alto, California, United States, North Am...",United States
3,Clari,Series F,225000000.0,2022-01-19,Clari is a revenue operations company that pro...,"Analytics, Artificial Intelligence, Business I...","Sunnyvale, California, United States, North Am...",United States
4,Soterea,Series B,204049300.0,2022-03-28,Soterea develops an intelligent driving system.,"Artificial Intelligence, Automotive, Autonomou...","Tianjin, Guangdong, China, Asia",China


In [28]:
def industry_Calcs(df, country = None):
    if country:
        # get rows from the df that have the country we want
        temp_df = df[df["Organization Country"] == country].reset_index(drop=True)
    else:
        temp_df = df
    # first create a dictionary that will hold industries and their counts
    industry_counts = {}
    # also create a dictionary that will hold the industries and their total funds raised
    industry_total = {}
    # loop through df["Organization Industries"] and for each entry, 
    # if the industry is not in the dictionary, add it with a count of 1
    # if the industry is in the dictionary, increment the count by 1
    for idx, long_industry in enumerate(temp_df["Organization Industries"]):
        # split the long string on every ", "
        industries = long_industry.split(", ")
        for industry in industries:
            if industry not in industry_counts:
                industry_counts[industry] = 1
                # the industry total is the "Money Raised" from the row in df with the index idx
                industry_total[industry] = temp_df["Money Raised"][idx]
            else:
                industry_counts[industry] += 1
                industry_total[industry] += temp_df["Money Raised"][idx]
    # delete the entry "Artificial Intelligence", "Software" and "Machine Learning" because many share this
    del industry_counts["Artificial Intelligence"]
    del industry_counts["Software"]
    del industry_counts["Machine Learning"]
    del industry_total["Artificial Intelligence"]
    del industry_total["Software"]
    del industry_total["Machine Learning"]

    return industry_counts, industry_total
    

In [31]:
!jupyter labextension install jupyterlab-plotly@5.7.0 @jupyter-widgets/jupyterlab-manager

An error occurred.
ValueError: Please install Node.js and npm before continuing installation. You may be able to install Node.js from your package manager, from conda, or directly from the Node.js website (https://nodejs.org).
See the log file for details:  /var/folders/nw/lktt921x1wg_wt5tfy5q9k2c0000gn/T/jupyterlab-debug-edu6pcua.log


In [34]:
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.graph_objects as go

industry_counts, industry_total = industry_Calcs(df) 
industry_counts_USA, industry_total_USA = industry_Calcs(df, "United States")
industry_counts_China, industry_total_China = industry_Calcs(df, "China")

fig = make_subplots(rows=3, cols=1, subplot_titles= ["Total, Total in US, Total in China"])
fig.add_trace(go.Bar(x=list(industry_counts.keys()), y=sorted(list(industry_counts.values()), reverse=True), name="Total"), row=1, col=1)
fig.add_trace(go.Bar(x=list(industry_counts_USA.keys()), y=sorted(list(industry_counts_USA.values()), reverse=True), name="Total in US"), row=2, col=1)
fig.add_trace(go.Bar(x=list(industry_counts_China.keys()), y=sorted(list(industry_counts_China.values()), reverse=True), name="Total in China"), row=3, col=1)


# use plotly to create a bar chart of the industry counts sorted by count in darkmode
fig.update_layout(template="plotly_dark")
# make the x axis labels rotated a bit
fig.update_xaxes(title_text="Industries", tickangle=45)
fig.show()
# download the html of the plot
pio.write_html(fig, file="Charts/industry_counts_x3.html", auto_open=True)

# create one figure that has three subplots on for each country


In [33]:
!mkdir Charts

In [35]:
# make the same figure but with the total funds raised
fig = make_subplots(rows=3, cols=1, subplot_titles= ["Total, Total in US, Total in China"])
fig.add_trace(go.Bar(x=list(industry_total.keys()), y=sorted(list(industry_total.values()), reverse=True), name="Total"), row=1, col=1)
fig.add_trace(go.Bar(x=list(industry_total_USA.keys()), y=sorted(list(industry_total_USA.values()), reverse=True), name="Total in US"), row=2, col=1)
fig.add_trace(go.Bar(x=list(industry_total_China.keys()), y=sorted(list(industry_total_China.values()), reverse=True), name="Total in China"), row=3, col=1)
fig.update_layout(template="plotly_dark")
fig.update_xaxes(title_text="Industries", tickangle=45)
fig.show()

# download the html of the plot
pio.write_html(fig, file="Charts/industry_totals_x3.html", auto_open=True)

In [8]:
# use plotly to create a bar chart of the industry totals sorted by total in darkmode
fig = px.bar(x=list(industry_total.keys()), y=sorted(list(industry_total.values()), reverse=True), template="plotly_dark")
fig.show()
# download the html of the plot
pio.write_html(fig, file="Charts/industry_totals.html", auto_open=True)

NameError: name 'px' is not defined

In [9]:
df = assign_main_industry(df, industry_total)

NameError: name 'df' is not defined

In [10]:
# create a bubble plot with plotly of all Organizations and how much $ they've raised
# x = Announced Date, y = Organization Location, size = Money Raised, hovertext = Organization Name
fig = px.scatter(df, x="Announced Date", y="Main Industry", size="Money Raised", 
        hover_name="Organization Name", hover_data=["Organization Description"], color="Organization Country", template="plotly_dark")
fig.show()
# download the html of the plot
#pio.write_html(fig, file="Charts/plotly_bubble.html", auto_open=True)

NameError: name 'px' is not defined

In [168]:
# create the same bubble chart as above but only where Organization Country is "United States"
fig = px.scatter(df[df["Organization Country"] == "United States"], x="Announced Date", y="Main Industry", size="Money Raised",
        hover_name="Organization Name", hover_data=["Organization Description"], color="Organization Country", template="plotly_dark")
fig.show()