# Example of searching Stata files that contain matching variable names

Loop over a directory tree containing Stata `.dta` files. Read the files into a pandas DataFrame and search for files that contain matching variable names. The result is a dictionary with the Stata filename as the key and the value . is the variable names as a list (either full or narrowed just to the matches we're interested in).

In [1]:
!pip install wget

Collecting wget
Installing collected packages: wget
Successfully installed wget-3.2


In [2]:
import wget, os, os.path
examples = "Stata.zip"
os.path.isfile(examples) and os.remove(examples) # remove file if alreday exists before (re-)downlodaing
wget.download('http://www.principlesofeconometrics.com/zip_files/Stata.zip') # just a random example of stata files from the interwebs to illustrate how to do this

'Stata.zip'

In [3]:
from zipfile import ZipFile
with ZipFile(examples, 'r') as zipObj:
    zipObj.extractall()

In [4]:
import fnmatch
import os
import os.path
import re

startdir = '.' # by default start "here" in current directory
#startdir = '/path/to/dta/files'
includes = ['*.dta'] # for files only
excludes = [] # for dirs and files

# transform glob patterns to regular expressions
includes = r'|'.join([fnmatch.translate(x) for x in includes])
excludes = r'|'.join([fnmatch.translate(x) for x in excludes]) or r'$.'

for root, dirs, files in os.walk(startdir):

    # exclude dirs
    dirs[:] = [os.path.join(root, d) for d in dirs]
    dirs[:] = [d for d in dirs if not re.match(excludes, d)]

    # exclude/include files
    files = [os.path.join(root, f) for f in files]
    files = [f for f in files if not re.match(excludes, f)]
    files = [f for f in files if re.match(includes, f)]

    for fname in files:
        print(fname)

././Stata/table-c4.dta
././Stata/surplus.dta
././Stata/meat.dta
././Stata/brumm.dta
././Stata/savings.dta
././Stata/unit.dta
././Stata/cespro.dta
././Stata/ivreg1.dta
././Stata/coal.dta
././Stata/sirmans.dta
././Stata/cola2.dta
././Stata/pub.dta
././Stata/beer.dta
././Stata/cobb.dta
././Stata/table2-2.dta
././Stata/cloth.dta
././Stata/consumption.dta
././Stata/sp.dta
././Stata/demo.dta
././Stata/inflation.dta
././Stata/ivreg2.dta
././Stata/alcohol.dta
././Stata/table-c2.dta
././Stata/term.dta
././Stata/music.dta
././Stata/wa-wheat.dta
././Stata/salary.dta
././Stata/robbery.dta
././Stata/var.dta
././Stata/liquor.dta
././Stata/pubexp.dta
././Stata/sheep.dta
././Stata/tobit.dta
././Stata/table-c3.dta
././Stata/newbroiler.dta
././Stata/house_starts.dta
././Stata/chard.dta
././Stata/airline.dta
././Stata/oscar.dta
././Stata/nels_small.dta
././Stata/usa.dta
././Stata/nls.dta
././Stata/olympics.dta
././Stata/medical.dta
././Stata/stockton2.dta
././Stata/nls_panel2.dta
././Stata/stockton.dta
.

In [5]:
import pandas as pd

In [6]:
# using a dictionary comprehension to iterate over all stata files to read as a pandas dataframe 
# and generate a dictionary with filename as key and variable list as values
# https://python-reference.readthedocs.io/en/latest/docs/comprehensions/dict_comprehension.html
allvars = {fname : list(pd.read_stata(fname)) for fname in files}
allvars

{'././Stata/table-c4.dta': ['y1',
  'y2',
  'y3',
  'y4',
  'y5',
  'y6',
  'y7',
  'y8',
  'y9',
  'y10'],
 '././Stata/surplus.dta': ['p', 'q'],
 '././Stata/meat.dta': ['QB', 'IN', 'PB', 'PL', 'PP'],
 '././Stata/brumm.dta': ['x', 'y', 'z', 'initial', 'poprate', 'inv', 'school'],
 '././Stata/savings.dta': ['savings', 'income', 'average_income'],
 '././Stata/unit.dta': ['w', 'x', 'y', 'z'],
 '././Stata/cespro.dta': ['k', 'l', 'q', 'r', 'w', 'p'],
 '././Stata/ivreg1.dta': ['x', 'e'],
 '././Stata/coal.dta': ['x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'y'],
 '././Stata/sirmans.dta': ['adjust',
  'fixrate',
  'margin',
  'yield',
  'points',
  'maturity',
  'networth'],
 '././Stata/cola2.dta': ['id',
  'pepsi',
  'sevenup',
  'coke',
  'pr_pepsi',
  'pr_7up',
  'pr_coke',
  'feat_pepsi',
  'feat_7up',
  'feat_coke',
  'disp_pepsi',
  'disp_7up',
  'disp_coke'],
 '././Stata/pub.dta': ['y', 's3', 's2', 's1', 'q', 'p'],
 '././Stata/beer.dta': ['q', 'pb', 'pl', 'pr', 'i'],
 '././Stata/cobb.dta': ['k',

In [7]:
# filter files to just the subset that matches the list of variables we are interested in
# and generate a dictionary with filename as key and the FULL variable list in that file
matches = ['id','x','y']
files_with_matching_variables = {fname : variables for fname,variables in allvars.items() if any(elem in matches for elem in variables)} 
files_with_matching_variables

{'././Stata/brumm.dta': ['x', 'y', 'z', 'initial', 'poprate', 'inv', 'school'],
 '././Stata/unit.dta': ['w', 'x', 'y', 'z'],
 '././Stata/ivreg1.dta': ['x', 'e'],
 '././Stata/coal.dta': ['x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'y'],
 '././Stata/cola2.dta': ['id',
  'pepsi',
  'sevenup',
  'coke',
  'pr_pepsi',
  'pr_7up',
  'pr_coke',
  'feat_pepsi',
  'feat_7up',
  'feat_coke',
  'disp_pepsi',
  'disp_7up',
  'disp_coke'],
 '././Stata/pub.dta': ['y', 's3', 's2', 's1', 'q', 'p'],
 '././Stata/table2-2.dta': ['x',
  'y1',
  'y2',
  'y3',
  'y4',
  'y5',
  'y6',
  'y7',
  'y8',
  'y9',
  'y10'],
 '././Stata/ivreg2.dta': ['x', 'y', 'z1', 'z2'],
 '././Stata/salary.dta': ['e', 't', 'x', 'y'],
 '././Stata/tobit.dta': ['y', 'x'],
 '././Stata/table-c3.dta': ['y'],
 '././Stata/newbroiler.dta': ['year',
  'q',
  'y',
  'p',
  'pb',
  'pcorn',
  'pf',
  'qprod',
  'lexpts',
  'popgro'],
 '././Stata/airline.dta': ['year', 'y', 'w', 'r', 'l', 'k'],
 '././Stata/nls.dta': ['id',
  'year',
  'lwage',
  'hou

In [8]:
# Similar to above, but we ignore any variables not in our match list using set intersection.
#
# filter files to just the subset that matches the list of variables we are interested in
# and generate a dictionary with filename as key and the NARROWED variable list in that file
matches = ['id','x','y']
files_with_matching_variables_filtered = {fname : set(matches).intersection(variables) for fname,variables in allvars.items() if any(elem in matches for elem in variables)} 
files_with_matching_variables_filtered

{'././Stata/brumm.dta': {'x', 'y'},
 '././Stata/unit.dta': {'x', 'y'},
 '././Stata/ivreg1.dta': {'x'},
 '././Stata/coal.dta': {'y'},
 '././Stata/cola2.dta': {'id'},
 '././Stata/pub.dta': {'y'},
 '././Stata/table2-2.dta': {'x'},
 '././Stata/ivreg2.dta': {'x', 'y'},
 '././Stata/salary.dta': {'x', 'y'},
 '././Stata/tobit.dta': {'x', 'y'},
 '././Stata/table-c3.dta': {'y'},
 '././Stata/newbroiler.dta': {'y'},
 '././Stata/airline.dta': {'y'},
 '././Stata/nls.dta': {'id'},
 '././Stata/medical.dta': {'id'},
 '././Stata/nls_panel2.dta': {'id'},
 '././Stata/ces.dta': {'y'},
 '././Stata/clothes.dta': {'x', 'y'},
 '././Stata/figureC-3.dta': {'y'},
 '././Stata/pro.dta': {'y'},
 '././Stata/demand.dta': {'y'},
 '././Stata/manuf.dta': {'y'},
 '././Stata/vec.dta': {'x', 'y'},
 '././Stata/toodyay.dta': {'y'},
 '././Stata/share.dta': {'y'},
 '././Stata/ch10.dta': {'x', 'y'},
 '././Stata/money.dta': {'y'},
 '././Stata/hip.dta': {'y'},
 '././Stata/broiler.dta': {'y'},
 '././Stata/hhsurvey.dta': {'x'},
 './