In [None]:
import pandas as pd 
import numpy as np
from dotenv import load_dotenv
import os 
import re
from import_keywords import extract_keywords, clean_text, find_matched_keywords, match_transition, find_keywords_from_df


In [None]:
# Load variables from a .env file into the process environment.
load_dotenv()

PROJECT_DATA_DIR = os.getenv('DATA_DIR')
SAVE_DIR = os.getenv('SAVE_DIR')

# Fail fast with a readable message: os.getenv returns None for an unset
# variable, and os.path.join(None, ...) would otherwise raise an opaque TypeError.
_missing_env = [
    name
    for name, value in (('DATA_DIR', PROJECT_DATA_DIR), ('SAVE_DIR', SAVE_DIR))
    if value is None
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
        + ". Set them in the shell or in a .env file."
    )

# Input companies file, keyword workbook, and output location for the tagged data.
data_path = os.path.join(PROJECT_DATA_DIR, 'Agrifood/Agrifood_complete.csv')
keywords_data = os.path.join(PROJECT_DATA_DIR, 'Startup_Keywords.xlsx')
save_path = os.path.join(SAVE_DIR, 'agrifood_tagged.csv')

### Extract keywords per specified ecosystem from the Excel workbook into a pandas DataFrame

`extract_keywords` can be used to specify which keywords should be extracted from the Startup_Keywords workbook. The result is a DataFrame with the columns 'transition', 'technology', and '{ecosystem_column_name}'.

The df stores 1 keyword per row, allowing for simple slicing and processing if needed. 



In [None]:
# Read the keywords workbook, tidy the sheet, and extract the keyword cells
# for the required ecosystem into a DataFrame.
df = pd.read_excel(
    keywords_data,
    sheet_name='PY_Keywords_update',
    engine='openpyxl',
)

# Remove stray whitespace around the header labels.
df.columns = df.columns.map(lambda label: label.strip())

# Propagate each 'Transition' value down over the empty cells below it.
# NOTE(review): presumably the sheet only fills the first row of each
# transition group (e.g. merged cells) - confirm against the workbook.
df['Transition'] = df['Transition'].ffill()

agri_keywords_df = extract_keywords(df, industry_col='Agri-food')

In [None]:
# Load the companies to tag, merge their free-text fields into a single
# `search_text` column, and normalise it with clean_text.
agri_companies = pd.read_csv(data_path)

text_cols = ['short_description', 'description']


def _join_text_fields(row):
    """Join a row's non-null text fields into one space-separated string."""
    return ' '.join(row.dropna().astype(str).str.strip())


combined_text = agri_companies[text_cols].apply(_join_text_fields, axis=1)
agri_companies['search_text'] = combined_text.apply(clean_text)

# Quick visual check of the first few normalised texts.
print(agri_companies['search_text'].head(10).values)

In [None]:
# Tag each company with categories by running find_keywords_from_df over its
# search text against the Agri-food keyword table.
# NOTE(review): presumably each result is a list of technology categories,
# one per matched keyword - confirm in import_keywords.
agri_companies['category_tags'] = agri_companies['search_text'].apply(
    find_keywords_from_df, args=(agri_keywords_df,)
)

# One row per (company, tag) pair so the tags can be counted individually.
category_exploded = agri_companies.explode('category_tags')
category_distribution = category_exploded['category_tags'].value_counts()

print("Category tag distribution:", category_distribution, sep="\n")

In [None]:
# Tag keywords: collect the distinct keywords from the third column of the
# keyword table and record the ones found in each company's search text.
# NOTE(review): the matches are presumably stored as comma-separated
# entries - confirm against find_matched_keywords.
keyword_column = agri_keywords_df.iloc[:, 2]
keywords = keyword_column.dropna().astype(str).unique().tolist()

agri_companies['keywords'] = agri_companies['search_text'].apply(
    find_matched_keywords, args=(keywords,)
)

In [None]:
# Flag each company against the green and the digital transition by running
# match_transition over its search text, once per transition keyword set.
# NOTE(review): `green_keywords` and `digital_keywords` are not defined in any
# cell visible here, so this cell raises NameError on a fresh kernel
# (Restart & Run All). They are presumably built from agri_keywords_df, split
# by its transition column - TODO restore the cell that constructs them.
agri_companies['green'] = agri_companies['search_text'].apply(lambda txt: match_transition(txt, green_keywords))
agri_companies['digital'] = agri_companies['search_text'].apply(lambda txt: match_transition(txt, digital_keywords))


In [None]:
# Keep only the companies with at least one non-blank entry among the
# keyword, digital-transition and green-transition columns.
has_keywords = agri_companies['keywords'].str.strip().ne('')
has_digital = agri_companies['digital'].str.strip().ne('')
has_green = agri_companies['green'].str.strip().ne('')

# Copy so later edits to filtered_df do not touch agri_companies.
filtered_df = agri_companies.loc[has_keywords | has_digital | has_green].copy()

In [None]:
# Write the filtered, tagged companies to disk.
# Create the destination folder first so the export does not fail with an
# OSError when SAVE_DIR does not exist yet.
output_dir = os.path.dirname(save_path)
if output_dir:  # dirname is '' when save_path is a bare filename
    os.makedirs(output_dir, exist_ok=True)

filtered_df.to_csv(save_path, index=False)