In [8]:
import re
from pathlib import Path
import pandas as pd

file_path = "cap.tex"   # ←ここだけ自分のTeXに変更

text = Path(file_path).read_text(encoding="utf-8", errors="replace")

# 「カンマの直後が半角スペースではない」(ただし末尾のカンマは除外)
pattern = re.compile(r".(?! )")

hits = []
for m in pattern.finditer(text):
    i = m.start()
    if i + 1 >= len(text):  # 末尾の "," は除外
        continue

    # 行番号・列番号（1始まり）
    line_no = text.count("\n", 0, i) + 1
    last_nl = text.rfind("\n", 0, i)
    col_no = i - (last_nl + 1) + 1

    # カンマ直前10文字
    start = max(0, i - 10)
    before10 = text[start:i]

    # 次の1文字（改行なども分かるように）
    next_ch = text[i + 1]

    hits.append({
        "line": line_no,
        "col": col_no,
        "pos": i,
        "before10": before10.replace("\n", "\\n").replace("\t", "\\t").replace("\r", "\\r"),
        "next_char": repr(next_ch),
    })


In [9]:
df = pd.DataFrame(hits)

In [10]:
print(df)

        line  col     pos      before10 next_char
0          1    1       0                     'd'
1          1    2       1             \       'o'
2          1    3       2            \d       'c'
3          1    4       3           \do       'u'
4          1    5       4          \doc       'm'
...      ...  ...     ...           ...       ...
103845  3386    9  118693  \n\n\end{doc       'm'
103846  3386   10  118694   \n\end{docu       'e'
103847  3386   11  118695    \end{docum       'n'
103848  3386   12  118696    end{docume       't'
103849  3386   13  118697    nd{documen       '}'

[103850 rows x 5 columns]


In [11]:
from pathlib import Path

out_dir = Path(".")  # 保存先（必要なら Path("results") などに）
out_dir.mkdir(exist_ok=True)

csv_path = out_dir / "comma_hits.csv"
df.sort_values(["line", "col"]).to_csv(csv_path, index=False, encoding="utf-8-sig")

## .を計算

In [12]:
import re
from pathlib import Path
import pandas as pd

file_path = "cap.tex"   # ←ここだけ変更

text = Path(file_path).read_text(encoding="utf-8", errors="replace")

# 「ピリオドの直後が半角スペースではない」
pattern = re.compile(r"\.(?! )")

hits = []
for m in pattern.finditer(text):
    i = m.start()
    if i + 1 >= len(text):  # 末尾の "." は除外
        continue

    line_no = text.count("\n", 0, i) + 1
    last_nl = text.rfind("\n", 0, i)
    col_no = i - (last_nl + 1) + 1

    start = max(0, i - 10)
    before10 = text[start:i]
    next_ch = text[i + 1]

    hits.append({
        "line": line_no,
        "col": col_no,
        "pos": i,
        "before10": before10.replace("\n", "\\n").replace("\t", "\\t").replace("\r", "\\r"),
        "next_char": repr(next_ch),
    })

df_dot = pd.DataFrame(hits).sort_values(["line", "col"])
df_dot


Unnamed: 0,line,col,pos,before10,next_char
0,98,16,1898,t{preamble,'t'
1,154,189,4577,使い果たすはずである,'\n'
2,177,63,6367,on}_{t+1},'\n'
3,179,152,6536,}_{t+1}$はi,'i'
4,179,154,6538,{t+1}$はi.i,'d'
...,...,...,...,...,...
477,3229,41,114587,Z\ \text{s,'t'
478,3229,43,114589,\text{s.t,'}'
479,3261,14,115586,d{bmatrix},'\n'
480,3281,14,116013,d{bmatrix},'\n'


In [13]:
from pathlib import Path

out_dir = Path(".")  # 保存先（必要なら Path("results") などに）
out_dir.mkdir(exist_ok=True)

csv_path = out_dir / "period_hits.csv"
df_dot.sort_values(["line", "col"]).to_csv(csv_path, index=False, encoding="utf-8-sig")