# This is an example of how to use the AutoReg module.

## 1. Install the required packages

In [1]:
!pip install git+https://github.com/duoduoyeah/AutoReg.git

Collecting git+https://github.com/duoduoyeah/AutoReg.git
  Cloning https://github.com/duoduoyeah/AutoReg.git to /tmp/pip-req-build-oj57fj0e
  Running command git clone --filter=blob:none --quiet https://github.com/duoduoyeah/AutoReg.git /tmp/pip-req-build-oj57fj0e
  Resolved https://github.com/duoduoyeah/AutoReg.git to commit 4672a3c41cc67133f82869b4c3a15461edc287ba
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting litellm (from auto_reg==0.1.0)
  Downloading litellm-1.59.2-py3-none-any.whl.metadata (36 kB)
Collecting linearmodels (from auto_reg==0.1.0)
  Downloading linearmodels-6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.9 kB)
Collecting langchain-openai (from auto_reg==0.1.0)
  Downloading langchain_openai-0.3.1-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain-core<0.4.0,>=0.3.29 (from langchain->auto_reg==0.1.0)
  Downloading langchain_core-0.3.31-py3-none-any.whl.metadata (6.3 kB)
Collecting tiktoken<1,>=0.7 (from langchain-opena

## 2. Add secret key in the colab environment
You need to add your OpenAI API key and base URL to Secrets in the colab environment.
After you add the secret key, you can run the following code to check if the key is added successfully.
You need to add the following keys:
- OPENAI_API_KEY
- OPENAI_API_BASE
- DEEPSEEK_API_KEY (Optional)
- DEEPSEEK_API_BASE (Optional)


In [2]:
from google.colab import userdata
userdata.get('OPENAI_API_KEY')[:5]

'sk-Am'

## 3. Setup data
For demonstration, we use the research_config.json file and csv file in the example folder in our github repository.

When you want to use this tool, upload your own csv file and modify the research_config.json file to follow your own research topic and csv data. You could use the following json and csv file as a template.

In [3]:
!wget https://raw.githubusercontent.com/duoduoyeah/AutoReg/main/examples/research_config.json
!wget https://raw.githubusercontent.com/duoduoyeah/AutoReg/main/test_data/example_data.csv
!ls research_config.json example_data.csv

--2025-01-22 05:42:47--  https://raw.githubusercontent.com/duoduoyeah/AutoReg/main/examples/research_config.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2410 (2.4K) [text/plain]
Saving to: ‘research_config.json’


2025-01-22 05:42:47 (39.6 MB/s) - ‘research_config.json’ saved [2410/2410]

--2025-01-22 05:42:47--  https://raw.githubusercontent.com/duoduoyeah/AutoReg/main/test_data/example_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 166892 (163K) [text/plain]
Saving to: ‘example_data.csv’


2025-01-22 05:42:48 (

In [4]:
import pandas as pd
from auto_reg.regression.regression_config import *
import json

file_path = 'example_data.csv'
df = pd.read_csv(file_path)
df = df.set_index(['company_id', 'year'])
# Remove rows with missing values
df = df.dropna()

def load_research_config(config_path: str) -> ResearchConfig:
    with open(config_path) as f:
        config_data = json.load(f)
    return ResearchConfig(**config_data)

research_config = load_research_config('research_config.json')
research_config.validate_research_config(df)
research_topic: str = research_config.research_topic
print(research_topic)

How extreme weather events affect company stock prices


## 4. Create Langchain models
When you only use OpenAI model, you could use the following code to create the model.
```python
from langchain_openai import ChatOpenAI

model_4o = ChatOpenAI(
    model_name="gpt-4o",
    timeout=(45.0), # 45 seconds before timeout
    temperature=0,
    api_key=userdata.get('OPENAI_API_KEY'),
    base_url=userdata.get('OPENAI_API_BASE')
)

# User need to: map different tasks to specific models
model: dict[str, ChatOpenAI] = {
    'design_model': model_4o,  # For designing table layouts
    'draw_model': model_4o,  # For drawing regression tables
    'analysis_model': model_4o,  # For analyzing regression results
}
```

In [5]:
from langchain_openai import ChatOpenAI

model_4o = ChatOpenAI(
    model_name="gpt-4o",
    timeout=(45.0), # 45 seconds before timeout
    temperature=0,
    api_key=userdata.get('OPENAI_API_KEY'),
    base_url=userdata.get('OPENAI_API_BASE')
)

# comment out the following code if you only use OpenAI model
model_deepseek = ChatOpenAI(
    model_name="deepseek-chat",
    temperature=0,
    timeout=(45.0),
    api_key=userdata.get('DEEPSEEK_API_KEY'),
    base_url=userdata.get('DEEPSEEK_API_BASE')
)

# User need to: map different tasks to specific models
model: dict[str, ChatOpenAI] = {
    'design_model': model_4o,  # For designing table layouts
    'draw_model': model_4o,  # For drawing regression tables
    'analysis_model': model_deepseek,  # For analyzing regression results
}

## 5. Run the regression analysis
The variable `regression_results` is a list of RegressionResult objects.

In [6]:
from auto_reg.regression.panel_data import run_regressions

regression_results = run_regressions(
    df,
    research_config.generate_regression_configs()
)
print(regression_results[0].description)
print(regression_results[0].regression_type)
print(regression_results[0].regression_config)

Two Basic regressions.with and without controls
 The first regression result is the one without controls
 The second regression result is the one with controls
panel_regression
dependent_vars: ['stock_revenue']
dependent_var_description: ["Company's abnormal stock returns, calculated as the difference between actual and expected returns"]
independent_vars: ['extreme_temperature']
independent_var_description: ['Frequency of extreme temperature events per year, standardized across all companies']
control_vars: ['company_size', 'company_age', 'company_distance_to_sea', 'rain_amount', 'dry_amount']
control_vars_description: ['Size of the company, measured by natural logarithm of total assets', 'Age of the company since establishment in years', 'Distance of company headquarters to nearest coastline in kilometers', "Annual rainfall amount in millimeters of company's location", "Number of dry days per year of company's location"]
constant: True
regression_type: basic regression, the dependent

The regression result data is stored in the `RegressionResult().results` as a list of PanelEffectsResults objects.

In [7]:
print(regression_results[0].results)

[                          PanelOLS Estimation Summary                           
Dep. Variable:          stock_revenue   R-squared:                        0.0047
Estimator:                   PanelOLS   R-squared (Between):              0.0573
No. Observations:                1000   R-squared (Within):               0.0200
Date:                Wed, Jan 22 2025   R-squared (Overall):              0.0242
Time:                        05:43:08   Log-likelihood                   -1341.3
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      4.2008
Entities:                         100   P-value                           0.0407
Avg Obs:                      10.0000   Distribution:                   F(1,890)
Min Obs:                      10.0000                                           
Max Obs:                      10.0000   F-statistic (robust):             3.4073
                           

## 6. Generate the regression table design
In this section, we first use a language model to design the regression tables.
The design process involves generating a layout for the tables based on the research topic
and the results of the regression analysis. The design_model helps in structuring the tables in a way that is
most informative and visually appealing.

In this example, we design 3 tables. You could modify the number of tables in the following code.
```python
number_of_tables = 6
```

In [8]:
from auto_reg.analysis.generate_table import *
from auto_reg.analysis.design import *

# Design regression tables
table_design: TableDesign|None = await design_regression_tables(
    research_topic,
    regression_results,
    model['design_model']
)

# if table_design is None, the design process failed.
assert table_design is not None

table_design = select_table_design(table_design, number_of_tables=3)
print(table_design)


Get valid table design
number_of_tables=3 table_index=[[0, 5], [1, 2], [3, 4]] table_regression_nums=[4, 2, 2] table_title=['Basic Regressions with and without controls & 2SLS Endogeneity Test', 'Robustness Tests: Alternative Independent and Dependent Variables', 'Robustness Tests: Alternative Fixed Effects and Extra Controls']


## 7. Generate the research report
After the design process, we draw the tables and analyze the results.
You could change the language used in the following code.
```python
language_used = "Chinese"
```


In [9]:

# draw tables
table_results = ResultTables()
await draw_tables(
    regression_results,
    table_design,
    model['draw_model'],
    table_results
)

# analyze results
await analyze_regression_results(
    regression_results,
    table_design,
    table_results,
    model['analysis_model'],
    language_used="English"
)

# combine tables
combined_table_results: ResultTables = await combine_tables(
    table_results,
    table_design,
    model['draw_model'])


## 8. Output Latex File
In the left folder bar, you will find a temp folder. The temp folder contains the following files:
- raw_regression_result.txt: the raw regression result of each regression.
- analysis.tex: the analysis of the regression result.

If you use Chinese, you may need to learn further how to use latex to output chinese.


In [10]:
import os
if not os.path.exists('temp'):
    os.makedirs('temp')

with open('temp/raw_regression_result.txt', 'w') as raw_file:
    for regression in regression_results:
        raw_file.write(str(regression) + '\n')

with open('temp/analysis.tex', 'w') as analysis_file:

    analysis_file.write("\\documentclass{article}\n")
    analysis_file.write("\\usepackage{graphicx} % Required for inserting images\n")
    analysis_file.write("\\usepackage{booktabs}\n")
    analysis_file.write("\\usepackage{threeparttable}\n")
    analysis_file.write("\\title{RegressFast}\n")
    analysis_file.write("\\begin{document}\n")

    for i in range(len(combined_table_results.analysis)):
        analysis_file.write(combined_table_results.analysis[i].analysis + '\n')
        analysis_file.write(combined_table_results.tables[i].latex_table + '\n')

    analysis_file.write("\\end{document}\n")