In [None]:
import pandas as pd  # For data manipulation and creating DataFrames
import requests  # For making HTTP requests to the API
from bs4 import BeautifulSoup as bs
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.corpus import stopwords as STOPWORDS
from nltk.tokenize import word_tokenize
from os import path
from PIL import Image
from wordcloud import WordCloud

stopwords = set(STOPWORDS.words('english'))  # Load the English stopwords from NLTK
stopwords.update(['across', 'help', 'skills', 'will'])  # Add custom stopwords

# Define the stopwords for text processing


def fetch_jobs_from_api(sites):
    """
    Fetches job listings from the APIs of the specified sites.

    Args:
        sites (list): A list of site names to fetch jobs from. Valid options are 'remoteok' and 'jobicy'.

    Returns:
        A dataframe of job listings if successful, None otherwise.
    """
    api_urls = {
        "remoteok": "https://remoteok.com/api",
        'jobicy': "https://jobicy.com/api/v2/remote-jobs"
    }
    
    for site in sites:
        if site not in api_urls:
            print(f"Error: {site} is not a valid site. Valid sites are: {', '.join(api_urls.keys())}")
            return None
        api_url = api_urls[site]
        print(f"Attempting to fetch data from: {api_url}")

        try:
            response = requests.get(api_url, timeout=10)  # Added timeout for robustness
            response.raise_for_status()  # Raises an HTTPError for bad responses (4XX or 5XX)

            data = response.json()
            if (isinstance(data, list) or isinstance(data, dict)) and len(data) > 0:
                if site == 'remoteok':
                    return parse_remoteok_jobs_to_structured_df(data)
                else:
                    return parse_jobicy_jobs_to_structured_df(data)

            elif (isinstance(data, list) or isinstance(data, dict)) and len(data) == 0:
                print("API returned an empty list of jobs.")
                return []
            else:
                print(f"Unexpected API response format. Expected a list, got {type(data)}.")
                return None

        except requests.exceptions.ConnectionError:
            print(f"Error: Connection error occurred while trying to reach {api_url}.")
            return None
        except requests.exceptions.TooManyRedirects:
            print(f"Error: Too many redirects while trying to reach {api_url}.")
            return None        
        except requests.exceptions.Timeout:
            print(f"Error: Request to {api_url} timed out.")
            return None
        except requests.exceptions.HTTPError as http_err:
            print(f"Error: HTTP error occurred: {http_err} - Status Code: {response.status_code}")
            return None
        except requests.exceptions.RequestException as req_err:
            print(f"Error: An error occurred while fetching data from API: {req_err}")
            return None
        except ValueError as json_err:  # Includes json.JSONDecodeError
            print(f"Error: Could not decode JSON response: {json_err}")
            return None


def parse_remoteok_jobs_to_structured_df(data):
    """
    Parses a list of job dictionaries (from API) into a pandas DataFrame.
    Selects relevant columns and performs basic data cleaning/transformation.

    Args:
        data (list): A list of dictionaries from the API.

    Returns:
        pandas.DataFrame: A DataFrame containing structured job data, or an empty DataFrame if input is invalid.
    """
    # The RemoteOK API returns a list. The first item is a "legal notice" or API info.
    # Actual job listings start from the second item.
    if data[0].get("legal") is not None:
        print(f"Skipping the first element (meta-data/legal): {data[0].get('legal')}")
        job_list = data[1:]
    else:
        # If the first element doesn't look like metadata, perhaps the API structure changed.
        # For now, we'll assume it's all job data.
        print("First element does not appear to be metadata. Processing all elements as jobs.")
        job_list = data
    
    if not job_list:
        print("No job data provided. Returning empty DataFrame.")
        return pd.DataFrame()

    print(f"Normalizing {len(job_list)} job entries into a DataFrame...")
    # Use pandas.json_normalize to flatten the JSON structures.
    df = pd.json_normalize(job_list)

    # --- Data Cleaning and Transformation ---

    # Define the columns we are interested in.
    desired_columns = [
        'id', 'company', 'position', 'tags', 'location', 'salary_min', 'salary_max'
    ]
    # Define the keywords to look for in job titles
    keywords = 'analy|data|machine learning|intelligence'

    # Select only the desired columns that are actually present in the DataFrame
    # This makes the script more robust to changes in the API response
    columns_to_select = [col for col in desired_columns if col in df.columns]

    if not columns_to_select:
        print("None of the desired columns were found in the API response. Returning empty DataFrame.")
        return pd.DataFrame()

    df_selected = df[columns_to_select][df['position'].str.contains(keywords, case=False) |
                                        df['tags'].str.contains(keywords,
                                                                case=False)].copy()  # Use .copy() to avoid SettingWithCopyWarning

    # Convert 'epoch' to datetime objects
    if 'epoch' in df.columns:
        # Ensure 'epoch' is numeric, coercing errors to NaT (Not a Time)
        df_selected['epoch'] = pd.to_datetime(df['epoch'], unit='s', errors='coerce')

    # Convert 'tags' list into a comma-separated string for easier use in SQL/CSV.
    if 'tags' in df_selected.columns:
        df_selected['tags_string'] = df_selected['tags'].apply(
            lambda tags_list: ', '.join(tags_list) if isinstance(tags_list, list) and tags_list else None
        )

    # Clean up HTML and robot message from description
    if 'description' in df.columns:
        df_selected['description'] = df['description'].apply(
            lambda html: bs(html, 'html.parser').get_text()
        )
        df_selected['description'] = df_selected['description'].str.replace(r'Please mention the word(.)*', "",
                                                                            regex=True)

    df_selected['source'] = 'REmote OK'

    return df_selected


def parse_jobicy_jobs_to_structured_df(data):
    """
    Parses a job dictionary (from API) into a pandas DataFrame.
    Selects relevant columns and performs basic data cleaning/transformation.

    Args:
        data (dict): A dictionary from the API.

    Returns:
        pandas.DataFrame: A DataFrame containing structured job data, or an empty DataFrame if input is invalid.
    """
    
    # The jobicy API returns a list with metadata.
    # Actual job listings are in the "job" dictionary.
    print(f"Friendly notice: {data.get('friendlyNotice')}")
    job_list = data.get('jobs', [])
    
    if not job_list or not isinstance(job_list, list):
        print("No job data provided or data is not in list format. Returning empty DataFrame.")
        return pd.DataFrame()

    print(f"Normalizing {len(job_list)} job entries into a DataFrame...")
    # Use pandas.json_normalize to flatten the JSON structures.
    df = pd.json_normalize(job_list)

    # --- Data Cleaning and Transformation ---

    # Define the columns we are interested in.
    desired_columns = [
        'id', 'company', 'position', 'tags', 'location', 'salary_min', 'salary_max'
    ]
    # Define the keywords to look for in job titles
    keywords = 'analy|data|machine learning|intelligence'

    # Select only the desired columns that are actually present in the DataFrame
    # This makes the script more robust to changes in the API response
    columns_to_select = [col for col in desired_columns if col in df.columns]

    if not columns_to_select:
        print("None of the desired columns were found in the API response. Returning empty DataFrame.")
        return pd.DataFrame()

    df_selected = df[columns_to_select][df['position'].str.contains(keywords, case=False) |
                                        df['tags'].str.contains(keywords,
                                                                case=False)].copy()  # Use .copy() to avoid SettingWithCopyWarning

    # Convert 'epoch' to datetime objects
    if 'epoch' in df.columns:
        # Ensure 'epoch' is numeric, coercing errors to NaT (Not a Time)
        df_selected['epoch'] = pd.to_datetime(df['epoch'], unit='s', errors='coerce')

    # Convert 'tags' list into a comma-separated string for easier use in SQL/CSV.
    if 'tags' in df_selected.columns:
        df_selected['tags_string'] = df_selected['tags'].apply(
            lambda tags_list: ', '.join(tags_list) if isinstance(tags_list, list) and tags_list else None
        )

    # Clean up HTML and robot message from description
    if 'description' in df.columns:
        df_selected['description'] = df['description'].apply(
            lambda html: bs(html, 'html.parser').get_text()
        )
        df_selected['description'] = df_selected['description'].str.replace(r'Please mention the word(.)*', "",
                                                                            regex=True)

    df_selected['source'] = 'REmote OK'

    return df_selected


def generate_wordcloud(text, mask_image_path=None):
    """
    Generates a word cloud from the provided text.

    Args:
        text (str): The text to generate the word cloud from.
        mask_image_path (str): Path to an image file to use as a mask for the word cloud.

    Returns:
        WordCloud: A WordCloud object.
    """
    if mask_image_path and path.exists(mask_image_path):
        mask = Image.open(mask_image_path)
        mask = mask.convert("L")  # Convert to grayscale
        mask_array = np.array(mask)
    else:
        mask_array = None

    wc = WordCloud(width=800, height=400,
                   background_color='white',
                   max_words=30,
                   stopwords=stopwords,
                   mask=mask_array,
                   contour_color='steelblue',
                   contour_width=1).generate(text)

    return wc


[nltk_data] Downloading package stopwords to /home/caddy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/caddy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/caddy/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
jobs_df = fetch_jobs_from_api(['remoteok'])
#jobs_df = parse_jobs_to_structured_dataframe(raw_job_data, 'Remote OK')
jobs_df

Attempting to fetch data from: https://remoteok.com/api
Skipping the first element (meta-data/legal): API Terms of Service: Please link back (with follow, and without nofollow!) to the URL on Remote OK and mention Remote OK as a source, so we get traffic back from your site. If you do not we'll have to suspend API access.

Please don't use the Remote OK logo without written permission as it's a registered trademark, please DO use our name Remote OK though.
Normalizing 95 job entries into a DataFrame...


Unnamed: 0,id,company,position,tags,location,salary_min,salary_max,epoch,tags_string,description,source
0,1093192,Ironclad,Manager Engineering II Data Pipelines,"[manager, design, hr, docker, technical, softw...",San Francisco,67500,97500,2025-05-17 13:00:04,"manager, design, hr, docker, technical, softwa...",Ironclad is the #1 contract lifecycle manageme...,REmote OK
9,1093176,Warner Music Inc.,Staff Machine Learning Engineer,"[music, design, system, training, software, cl...",,60000,97500,2025-05-15 00:00:20,"music, design, system, training, software, clo...","At Warner Music Group, weâre a global collec...",REmote OK
26,1093152,Fullscript,Senior Data Analytics Engineer,"[design, python, technical, support, testing, ...",,67500,120000,2025-05-10 21:00:03,"design, python, technical, support, testing, g...","At Fullscript, weâre not just changing healt...",REmote OK
48,1093124,Utility Profit + Sunroom,Data and Business Intelligence Engineer Mid Le...,"[design, saas, leader, operations, telecom, en...",,70000,120000,2025-05-07 08:00:15,"design, saas, leader, operations, telecom, eng...",About Utility ProfitUtility Profit is transfor...,REmote OK
64,1093098,500 WP Company LLC,Computational Journalist Data Reporting,"[embedded, training, software, director, devop...",DC-Washington-TWP Headquarters,57500,80000,2025-05-03 00:00:03,"embedded, training, software, director, devops...",Application Instructions Please list all profe...,REmote OK
86,1093062,KPA,Data Analyst,"[analyst, saas, salesforce, training, consulti...",Remote,70000,115000,2025-04-26 16:00:02,"analyst, saas, salesforce, training, consultin...","Founded in 1986, KPA is a leading provider of ...",REmote OK


In [None]:

# Generate word cloud using tags on job postings
tags = ", ".join(tag for tag in jobs_df.tags_string).lower()

wordcloud = generate_wordcloud(tags)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
from collections import Counter

description = " ".join(jobs_df.description).lower()

word_list = word_tokenize(" ".join(jobs_df.description).lower())
filtered_words = [word for word in word_list if word == 'r' or (len(word) > 2 and word not in stopwords)]

common_keywords = Counter(filtered_words).most_common(20) # Get the top 20 most frequent words

print("\nTop 20 most common keywords in job descriptions:")
for keyword, count in common_keywords:
     print(f"- {keyword}: {count}")


In [None]:
import pandas as pd  # For data manipulation and creating DataFrames
import requests  # For making HTTP requests to the API
from bs4 import BeautifulSoup as bs
import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import WordCloud, STOPWORDS


def fetch_remoteok_jobs_from_api():
    """
    Fetches job listings from the RemoteOK API.

    The RemoteOK API returns a list. The first element is often a legal notice or API information,
    so we skip it to get to the actual job listings.

    Returns:
        data: A list of job dictionaries if successful, None otherwise.
    """
    api_url = "https://remoteok.com/api"
    print(f"Attempting to fetch data from: {api_url}")
    try:
        response = requests.get(api_url, timeout=10)  # Added timeout for robustness
        response.raise_for_status()  # Raises an HTTPError for bad responses (4XX or 5XX)

        data = response.json()

        # The RemoteOK API returns a list. The first item is a "legal notice" or API info.
        # Actual job listings start from the second item.
        if isinstance(data, list) and len(data) > 0:
            if data[0].get("legal") is not None:
                print(f"Skipping the first element (meta-data/legal): {data[0].get('legal')}")
                return data[1:]  # Return the rest of the list
            else:
                # If the first element doesn't look like metadata, perhaps the API structure changed.
                # For now, we'll assume it's all job data.
                print("First element does not appear to be metadata. Processing all elements as data-jobs.")
                return data
        elif isinstance(data, list) and len(data) == 0:
            print("API returned an empty list of data-jobs.")
            return []
        else:
            print(f"Unexpected API response format. Expected a list, got {type(data)}.")
            return None

    except requests.exceptions.Timeout:
        print(f"Error: Request to {api_url} timed out.")
        return None
    except requests.exceptions.HTTPError as http_err:
        print(f"Error: HTTP error occurred: {http_err} - Status Code: {response.status_code}")
        return None
    except requests.exceptions.RequestException as req_err:
        print(f"Error: An error occurred while fetching data from API: {req_err}")
        return None
    except ValueError as json_err:  # Includes json.JSONDecodeError
        print(f"Error: Could not decode JSON response: {json_err}")
        return None


def fetch_jobicy_jobs_from_api():
    """
    Fetches job listings from the Jobicy API.

    The Jobicy API returns a list where the first element contains API information,
    so we skip it to get to the actual job listings.

    Returns:
        data: A list of job dictionaries if successful, None otherwise.
    """
    api_url = "https://jobicy.com/api/v2/remote-jobs"
    print(f"Attempting to fetch data from: {api_url}")
    try:
        response = requests.get(api_url, timeout=10)  # Added timeout for robustness
        response.raise_for_status()  # Raises an HTTPError for bad responses (4XX or 5XX)

        data = response.json()

        # The RemoteOK API returns a list. The first item is contains API info.
        # Actual job listings are in the second item.
        if isinstance(data, list) and len(data) > 0:
            if data[0].get("friendlyNotice") is not None:
                print(f"Skipping the first element (meta-data/legal): {data[0].get('friendlyNotice')}")
                return data[1].get('data-jobs')  # Return the job list
            else:
                # If the first element doesn't look like metadata, perhaps the API structure changed.
                # For now, we'll assume it's all job data.
                print("First element does not appear to be metadata. Processing all elements as data-jobs.")
                return data
        elif isinstance(data, list) and len(data) == 0:
            print("API returned an empty list of data-jobs.")
            return []
        else:
            print(f"Unexpected API response format. Expected a list, got {type(data)}.")
            return None

    except requests.exceptions.Timeout:
        print(f"Error: Request to {api_url} timed out.")
        return None
    except requests.exceptions.HTTPError as http_err:
        print(f"Error: HTTP error occurred: {http_err} - Status Code: {response.status_code}")
        return None
    except requests.exceptions.RequestException as req_err:
        print(f"Error: An error occurred while fetching data from API: {req_err}")
        return None
    except ValueError as json_err:  # Includes json.JSONDecodeError
        print(f"Error: Could not decode JSON response: {json_err}")
        return None


def parse_jobs_to_structured_dataframe(job_list):
    """
    Parses a list of job dictionaries (from API) into a pandas DataFrame.
    Selects relevant columns and performs basic data cleaning/transformation.

    Args:
        job_list (list): A list of dictionaries, where each dictionary represents a job.

    Returns:
        pandas.DataFrame: A DataFrame containing structured job data, or an empty DataFrame if input is invalid.
    """
    if not job_list or not isinstance(job_list, list):
        print("No job data provided or data is not in list format. Returning empty DataFrame.")
        return pd.DataFrame()

    print(f"Normalizing {len(job_list)} job entries into a DataFrame...")
    # Use pandas.json_normalize to flatten the JSON structures.
    df = pd.json_normalize(job_list)

    # --- Data Cleaning and Transformation ---

    # Define the columns we are interested in.
    desired_columns = [
        'id', 'company', 'position', 'tags', 'location', 'salary_min', 'salary_max'
    ]
    # Define the keywords to look for in job titles
    keywords = 'analy|data|machine learning|intelligence'

    # Select only the desired columns that are actually present in the DataFrame
    # This makes the script more robust to changes in the API response
    columns_to_select = [col for col in desired_columns if col in df.columns]

    if not columns_to_select:
        print("None of the desired columns were found in the API response. Returning empty DataFrame.")
        return pd.DataFrame()

    df_selected = df[columns_to_select][df['position'].str.contains(keywords, case=False) |
                                        df['tags'].str.contains(keywords,
                                                                case=False)].copy()  # Use .copy() to avoid SettingWithCopyWarning

    # Convert 'epoch' to datetime objects
    if 'epoch' in df.columns:
        # Ensure 'epoch' is numeric, coercing errors to NaT (Not a Time)
        df_selected['epoch'] = pd.to_datetime(df['epoch'], unit='s', errors='coerce')

    # Convert 'tags' list into a comma-separated string for easier use in SQL/CSV.
    if 'tags' in df_selected.columns:
        df_selected['tags_string'] = df_selected['tags'].apply(
            lambda tags_list: ', '.join(tags_list) if isinstance(tags_list, list) and tags_list else None
        )

    # Clean up HTML and robot message from description
    if 'description' in df.columns:
        df_selected['description'] = df['description'].apply(
            lambda html: bs(html, 'html.parser').get_text()
        )
        df_selected['description'] = df_selected['description'].str.replace(r'Please mention the word(.)*', "",
                                                                            regex=True)

    return df_selected


def analyze_job_data(job_postings):
    """Performs basic analysis on the fetched job listings."""

    all_descriptions = ", ".join(job_postings.description).lower()
    keywords = all_descriptions.split()
    common_keywords = Counter(keywords).most_common(20) # Get the top 20 most frequent words

    print("\nTop 20 most common keywords in job descriptions:")
    for keyword, count in common_keywords:
        if keyword not in STOPWORDS: # Basic stop word removal
            print(f"- {keyword}: {count}")

    # Generate word cloud using tags on job postings
    tags = ", ".join(job_postings.tags_string).lower()

    wordcloud = WordCloud().generate(tags)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()


print("--- Starting RemoteOK Job Data Pipeline ---")

# Step 1: Fetch job data from the API
raw_job_data = fetch_remoteok_jobs_from_api()
raw_job_data2 = fetch_jobicy_jobs_from_api()

if raw_job_data is None:
    print("Failed to fetch job data from remoteok. Exiting pipeline.")

if not raw_job_data:
    print("No job listings fetched from the API. Exiting pipeline.")

print(f"Successfully fetched {len(raw_job_data)} raw job entries.")

# Step 2: Parse and transform data into a pandas DataFrame
jobs_dataframe = parse_jobs_to_structured_dataframe(raw_job_data)

if jobs_dataframe.empty:
    print("DataFrame creation failed or resulted in an empty DataFrame. Exiting pipeline.")

print("\n--- DataFrame Information ---")
jobs_dataframe.info()

analyze_job_data(jobs_dataframe)

In [None]:
test = {'Name': 'Geeks', 1: [1, 2, 3, 4]} 
len(test)